def test_multinomial_vector(self):
    random = RandomStreams(utt.fetch_seed())
    n = tensor.lvector()
    pvals = tensor.matrix()
    out = random.multinomial(n=n, pvals=pvals)
    assert out.ndim == 2
    f = function([n, pvals], out)

    n_val = [1, 2, 3]
    pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
    pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)

    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(n_val, pvals_val)
    numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(n_val[:-1], pvals_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([n, pvals],
                 random.multinomial(n=n, pvals=pvals, size=(3,)))
    val2 = g(n_val, pvals_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
class SampleMultinomial(Layer):
    def __init__(self, from_logits=False, **kwargs):
        super(SampleMultinomial, self).__init__(**kwargs)
        self.from_logits = from_logits
        if K.backend() == 'theano':
            from theano.tensor.shared_randomstreams import RandomStreams
            self.random = RandomStreams()
        elif K.backend() == 'tensorflow':
            import tensorflow as tf
        else:
            raise NotImplementedError

    def call(self, x, mask=None):
        if K.backend() == 'theano':
            if self.from_logits:
                # TODO: there is a more direct way from logits
                return K.argmax(self.random.multinomial(pvals=K.softmax(x)))
            else:
                return K.argmax(self.random.multinomial(pvals=x))
        elif K.backend() == 'tensorflow':
            import tensorflow as tf
            shape = K.shape(x)
            if not self.from_logits:
                x = tf.clip_by_value(x, K.epsilon(), 1 - K.epsilon())
                x = tf.log(x)
            return K.reshape(
                tf.multinomial(K.reshape(x, [-1, shape[-1]]), 1),
                shape[:-1])
        else:
            raise NotImplementedError

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]
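# A minimal usage sketch for the layer above; the model and shapes are
# illustrative assumptions, not from the original source. The layer drops the
# last axis, so a (batch, 5) probability tensor yields (batch,) class indices.
from keras.layers import Input, Dense
from keras.models import Model

inputs = Input(shape=(32,))
probs = Dense(5, activation='softmax')(inputs)
samples = SampleMultinomial(from_logits=False)(probs)  # -> (batch,)
model = Model(inputs, samples)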
def prediction(self, h, bias):
    srng = RandomStreams(seed=42)

    prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
        self.compute_parameters(h, bias)

    mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

    v = T.arange(0, mean_x.shape[0])
    m_x = mean_x[v, mode]
    m_y = mean_y[v, mode]
    s_x = std_x[v, mode]
    s_y = std_y[v, mode]
    r = rho[v, mode]
    # cov = r * (s_x * s_y)

    normal = srng.normal((h.shape[0], 2))
    x = normal[:, 0]
    y = normal[:, 1]

    # x_n = T.shape_padright(s_x * x + cov * y + m_x)
    # y_n = T.shape_padright(s_y * y + cov * x + m_y)
    x_n = T.shape_padright(m_x + s_x * x)
    y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1. - r**2)))

    uniform = srng.uniform((h.shape[0],))
    pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

    return T.concatenate([x_n, y_n, pin], axis=1)
def sparse_sample_multinomial(x, from_logits=False):
    if K.backend() == 'theano':
        from theano.tensor.shared_randomstreams import RandomStreams
        random = RandomStreams()
        if from_logits:
            # TODO: there is a more direct way from logits
            return K.argmax(random.multinomial(pvals=K.softmax(x)))
        else:
            return K.argmax(random.multinomial(pvals=x))
    elif K.backend() == 'tensorflow':
        import tensorflow as tf
        shape = K.shape(x)
        if not from_logits:
            x = tf.clip_by_value(x, K.epsilon(), 1 - K.epsilon())
            x = tf.log(x)
        return K.reshape(tf.multinomial(K.reshape(x, (-1, shape[-1])), 1),
                         shape[:-1])
    else:
        raise NotImplementedError
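# Plain-numpy reference for what the Theano branch above computes (a sketch
# for intuition, not part of the original): the argmax of a one-hot
# multinomial draw is simply a categorical sample per row.
import numpy as np

def numpy_sample_multinomial(pvals, rng=np.random):
    # pvals: (batch, n_classes), each row summing to 1
    return np.array([rng.multinomial(1, p).argmax() for p in pvals])

print(numpy_sample_multinomial(np.array([[.1, .9], [.7, .3]])))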
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
        hidden_out_size = hidden_size * len(h)
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred),
                       input_dim=hidden_out_size,
                       output_dim=out_size * components_size[network_mode])
    sigma_linear = Linear(name='sigma_linear' + str(pred),
                          input_dim=hidden_out_size,
                          output_dim=components_size[network_mode])
    mixing_linear = Linear(name='mixing_linear' + str(pred),
                           input_dim=hidden_out_size,
                           output_dim=components_size[network_mode])
    initialize([mu_linear, sigma_linear, mixing_linear])

    mu = mu_linear.apply(hiddens)
    mu = mu.reshape((mu.shape[0], mu.shape[1], out_size,
                     components_size[network_mode]))
    sigma_orig = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma_orig)
    mixing_orig = mixing_linear.apply(hiddens)
    e_x = T.exp(mixing_orig - mixing_orig.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)

    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 2, 'x') - mu)**2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)

    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent
    cost = -T.mean(log_gauss)

    # sampling, with a bias applied to the mixing weights and widths
    srng = RandomStreams(seed=seed)
    mixing = mixing_orig * (1 + sampling_bias)
    sigma = T.nnet.softplus(sigma_orig - sampling_bias)
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'
    return linear_output, cost
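# Stand-alone numpy sketch of the LogSumExp trick used in the cost above
# (illustrative only, not part of the original source): log(sum_k exp(e_k))
# is computed stably by factoring out the per-row maximum before exponentiating.
import numpy as np

def logsumexp(exponent, axis):
    m = exponent.max(axis=axis, keepdims=True)
    return np.log(np.exp(exponent - m).sum(axis=axis, keepdims=True)) + m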
def score(self, Y, Y_hat):
    # TODO fix me later when using IndexSpace
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if isinstance(op, Print):
        assert len(owner.inputs) == 1
        Y_hat, = owner.inputs
        owner = Y_hat.owner
        op = owner.op
    assert isinstance(op, T.nnet.Softmax)
    state_below, = owner.inputs
    assert state_below.ndim == 2

    # TODO make this more generic like above
    state_below = state_below.owner.inputs[0].owner.inputs[0]

    Y = T.argmax(Y, axis=1)
    k = self.num_noise_samples

    if self.noise_prob is None:
        theano_rng = RandomStreams(seed=self.mlp.rng.randint(2 ** 15))
        noise = theano_rng.random_integers(
            size=(state_below.shape[0], self.num_noise_samples,),
            low=0, high=self.n_classes - 1)
        p_n = 1. / self.n_classes
        p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) +
                             self.b[Y])
        p_x = T.nnet.sigmoid(
            (T.concatenate([state_below] * k) *
             self.W[:, noise.flatten()].T).sum(axis=1) +
            self.b[noise.flatten()])
        # TODO is this reshape necessary?
        p_x = p_x.reshape((state_below.shape[0], k))
        #pos = k * p_n / (p_w + k * p_n) * T.log(p_w)
        #neg = (p_x / (p_x + k * p_n) * T.log(p_x)).sum(axis=1)
    else:
        theano_rng = MRG_RandomStreams(max(self.mlp.rng.randint(2 ** 15), 1))
        assert self.mlp.batch_size is not None
        noise = theano_rng.multinomial(
            pvals=np.tile(self.noise_prob.get_value(),
                          (k * self.mlp.batch_size, 1)))
        noise = T.argmax(noise, axis=1)
        p_n = self.noise_prob
        p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) +
                             self.b[Y])
        p_x = T.nnet.sigmoid(
            (T.concatenate([state_below] * k) *
             self.W[:, noise.flatten()].T).sum(axis=1) +
            self.b[noise.flatten()])
        p_x = p_x.reshape((state_below.shape[0], k))
        pos = k * p_n[Y] / (p_w + k * p_n[Y]) * T.log(p_w)
        neg = (p_x / (p_x + k * p_n[noise].reshape(p_x.shape)) *
               T.log(p_x)).sum(axis=1)
    #return -(pos - neg).mean()
    return p_w, p_x
def test_multinomial(self):
    """Test that RandomStreams.multinomial generates the same results as
    numpy."""
    # Check over two calls to see if the random state is correctly updated.
    random = RandomStreams(utt.fetch_seed())
    fn = function([], random.multinomial((4, 4), 1, [0.1] * 10),
                  updates=random.updates())

    fn_val0 = fn()
    fn_val1 = fn()

    rng_seed = numpy.random.RandomState(utt.fetch_seed()).randint(2**30)
    rng = numpy.random.RandomState(int(rng_seed))  # int() is for 32bit
    numpy_val0 = rng.multinomial(1, [0.1] * 10, size=(4, 4))
    numpy_val1 = rng.multinomial(1, [0.1] * 10, size=(4, 4))

    assert numpy.all(fn_val0 == numpy_val0)
    assert numpy.all(fn_val1 == numpy_val1)
def prediction(self, h, bias):
    srng = RandomStreams(seed=42)

    prop, mean, std = self.compute_parameters(h, bias)
    mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

    bs = mean.shape[0]
    v = T.arange(0, bs)
    m = mean[v, mode]  # (bs, d)
    s = std[v, mode]   # (bs, d)

    normal = srng.normal((bs, self.n_dim))  # (bs, d)
    normal_n = m + s * normal
    return normal_n
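# A numpy sketch of the same mixture-of-Gaussians sampling scheme (names and
# shapes are illustrative, not from the original): pick one component per row
# from the mixing proportions, then sample that component's diagonal Gaussian.
import numpy as np

def sample_mixture(prop, mean, std, rng=np.random):
    # prop: (bs, k); mean, std: (bs, k, d)
    modes = np.array([rng.multinomial(1, p).argmax() for p in prop])
    rows = np.arange(prop.shape[0])
    m, s = mean[rows, modes], std[rows, modes]  # (bs, d)
    return m + s * rng.standard_normal(m.shape)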
def direct_method(var, parm, p, S, seed=233):
    S = np.asarray(S)
    reps = var[0].shape[0]
    rng = RandomStreams(seed=seed)  # TODO: make sure it runs on GPU
    r_u = rng.uniform((1, reps))
    p_n = sum(p)
    prob = [p_i / p_n for p_i in p]
    # This happens because of variables as dvectors: (rxns, reps)
    v = tt.stack(prob).reshape((len(p), reps)).T
    # TODO: check prb_f for nans
    prb_f = theano.function(var + parm, v)
    tau_f = theano.function(var + parm, (1 / p_n) * tt.log(1 / r_u))
    ran_f = theano.function(var + parm, rng.multinomial(n=1, pvals=v))

    def compute(ics, parm, interval):
        x = ics
        reps = ics[0].shape[0]
        time = np.zeros(shape=(1, reps))
        out = np.zeros(shape=(reps, interval, len(var) + 1))  # (rep, time, vars+time)
        for idx in range(interval):
            args = [x[0], x[1]] + parm
            time += tau_f(*args)
            ran_i = ran_f(*args)
            incr = np.dot(ran_i, S).T
            x = np.asarray(x) + incr
            out[:, idx, :] = np.concatenate((time, x)).T
        return out

    return compute
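# Hypothetical usage sketch for direct_method above; the two-species system,
# rate names, replicate count, and step count are assumptions for
# illustration, not from the original source.
import numpy as np
import theano.tensor as tt

x1, x2 = tt.dvectors('x1', 'x2')  # species counts, one entry per replicate
k1, k2 = tt.dscalars('k1', 'k2')  # rate constants
p = [k1 * x1, k2 * x2]            # propensities of the two reactions
S = [[-1, 1],                     # reaction 1: x1 -> x2
     [1, -1]]                     # reaction 2: x2 -> x1
step = direct_method([x1, x2], [k1, k2], p, S)
# 100 replicates of 200 reaction events each; traj has shape (100, 200, 3)
traj = step([np.full(100, 50.), np.full(100, 50.)], [0.5, 0.3], 200)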
def test_default_shape(self):
    random = RandomStreams(utt.fetch_seed())
    f = function([], random.uniform())
    g = function([], random.multinomial())

    # seed_rng is a generator for generating *seeds* for RandomStates
    seed_rng = numpy.random.RandomState(utt.fetch_seed())
    uniform_rng = numpy.random.RandomState(int(seed_rng.randint(2**30)))
    multinomial_rng = numpy.random.RandomState(int(seed_rng.randint(2**30)))

    val0 = f()
    val1 = f()
    numpy_val0 = uniform_rng.uniform()
    numpy_val1 = uniform_rng.uniform()
    assert numpy.allclose(val0, numpy_val0)
    assert numpy.allclose(val1, numpy_val1)

    for i in range(10):
        # each draw has a 50% chance of matching by luck even with
        # non-matching random states, hence the repeated trials
        val2 = g()
        numpy_val2 = multinomial_rng.multinomial(n=1, pvals=[.5, .5])
        assert numpy.all(val2 == numpy_val2)
def sample(self, param_dict):
    p_vals = param_dict['p_vals']
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        shape = K.shape(p_vals)
        reshaped_params = K.reshape(p_vals, (-1, self.n_classes))
        samples = tf.multinomial(logits=tf.log(reshaped_params),
                                 num_samples=1)[:, 0]
        # a hack to turn it into one-hot
        onehot = tf.constant(np.eye(self.n_classes, dtype=np.float32))
        result = tf.nn.embedding_lookup(onehot, samples)
        result = K.reshape(result, shape)
        return result
    else:
        from theano.tensor.shared_randomstreams import RandomStreams
        random = RandomStreams()
        return random.multinomial(size=K.shape(p_vals)[:-1], n=1,
                                  pvals=p_vals, dtype='float32')
class SRNN(Model):
    def __init__(self,
                 name,  # a string for identifying the model
                 numvis,
                 numhid,
                 numframes,
                 output_type="real",
                 cheating_level=0.0,  # cheating by looking at x_t (instead of x_tm1)
                 numpy_rng=None,
                 theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numhid = numhid
        self.numlayers = 2
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(np.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name="inputs")

        # set up params
        self.whh = [
            theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name="whh0"),
            theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name="whh1"),
        ]
        self.whx = [
            theano.shared(
                value=self.numpy_rng.uniform(
                    low=-0.01, high=0.01,
                    size=(self.numhid, self.numvis)).astype(theano.config.floatX),
                name="whx0"),
            theano.shared(
                value=self.numpy_rng.uniform(
                    low=-0.01, high=0.01,
                    size=(self.numhid, self.numvis)).astype(theano.config.floatX),
                name="whx1"),
        ]
        self.wxh = [
            theano.shared(
                value=self.numpy_rng.uniform(
                    low=-0.01, high=0.01,
                    size=(self.numvis, self.numhid)).astype(theano.config.floatX),
                name="wxh0"),
            theano.shared(
                value=self.numpy_rng.uniform(
                    low=-0.01, high=0.01,
                    size=(self.numhid, self.numhid)).astype(theano.config.floatX),
                name="wxh1"),
        ]
        self.bhid = [
            theano.shared(value=np.zeros(self.numhid, dtype=theano.config.floatX), name="bhid0"),
            theano.shared(value=np.zeros(self.numhid, dtype=theano.config.floatX), name="bhid1"),
        ]
        self.bx = theano.shared(value=np.zeros(self.numvis, dtype=theano.config.floatX), name="bx")
        self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx]

        self._batchsize = self.inputs.shape[0]
        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis,
             self.numvis)).transpose(1, 0, 2)

        # one-step prediction, used by the sampling function
        self.hids_t0 = [T.zeros((self._batchsize, self.numhid)),
                        T.zeros((self._batchsize, self.numhid))]
        self.hids_t1 = [
            ReLU(T.dot(self.hids_t0[0], self.whh[0]) +
                 T.dot(self._input_frames[0], self.wxh[0]) + self.bhid[0])
        ]
        self.hids_t1.append(
            ReLU(T.dot(self.hids_t0[1], self.whh[1]) +
                 T.dot(self.hids_t1[-1], self.wxh[1]) + self.bhid[1]))
        self.x_pred_1 = self.bx
        for k in range(2):
            self.x_pred_1 += T.dot(self.hids_t1[k], self.whx[k])
        # end of one-step prediction

        def step(x_tm1, hids_tm1):
            hids_tm1 = [hids_tm1[:, k * self.numhid:(k + 1) * self.numhid]
                        for k in range(2)]
            hids_t = [ReLU(T.dot(hids_tm1[0], self.whh[0]) +
                           T.dot(x_tm1, self.wxh[0]) + self.bhid[0])]
            hids_t.append(ReLU(T.dot(hids_tm1[1], self.whh[1]) +
                               T.dot(hids_t[-1], self.wxh[1]) + self.bhid[1]))
            x_pred_t = self.bx
            for k in range(2):
                x_pred_t += T.dot(hids_t[k], self.whx[k])
            return x_pred_t, T.concatenate(hids_t, 1)

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames,
            outputs_info=[None, T.concatenate(self.hids_t0, 1)])

        # set up output prediction
        if self.output_type == "real":
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == "binary":
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == "softmax":
            # softmax doesn't support 3d tensors: reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1],
                     self.numvis))).reshape(
                (self._predictions.shape[0], self._predictions.shape[1],
                 self.numvis))
        else:
            raise ValueError("unsupported output_type")

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes - 1]
        if self.output_type == "real":
            self._cost = T.mean((self._prediction_for_training -
                                 self._input_frames[1:self.numframes]) ** 2)
            self._cost_varlen = T.mean((self._prediction -
                                        self._input_frames[1:]) ** 2)
        elif self.output_type == "binary":
            self._cost = -T.mean(
                self._input_frames[1:self.numframes] *
                T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:self.numframes]) *
                T.log(1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction) +
                (1.0 - self._input_frames[1:]) *
                T.log(1.0 - self._prediction))
        elif self.output_type == "softmax":
            self._cost = -T.mean(T.log(self._prediction_for_training) *
                                 self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(T.log(self._prediction) *
                                        self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano functions for computing cost and grad
        self.cost = theano.function([self.inputs], self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs], self._grads,
                                     updates=self.updates)

        # another set of variables: give some time steps of characters and
        # free the model to predict all the rest.
        self.inputs_var = T.fmatrix("inputs_var")
        self.nsteps = T.lscalar("nsteps")
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))),
            axis=1)
        self.predict = theano.function(
            [self.inputs_var,
             theano.Param(self.nsteps, default=self.numframes - 4)],
            self._prediction.transpose(1, 0, 2).reshape(
                (self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()
        return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == "softmax"
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0],
            [self.theano_rng.multinomial(
                pvals=T.nnet.softmax(self.x_pred_1 / temperature)),
             self.hids_1])
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=np.ones(self.numvis) / np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
class MDN(object):
    """Mixture Density Network"""

    def __init__(self, input, rng, n_in, n_hiddens, hid_activations,
                 n_out, out_activation, n_components):
        """Initialize the parameters for the multilayer perceptron

        :type rng: np.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hiddens: list of int
        :param n_hiddens: a list of the number of units in each hidden layer

        :type hid_activations: list of lambdas
        :param hid_activations: a list of the activations used in each
                                hidden layer

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        from theano.tensor.shared_randomstreams import RandomStreams
        self.srng = RandomStreams(seed=1234)
        self.input = input

        # We are dealing with a multiple-hidden-layer MLP
        layer0 = NetworkLayer(rng=rng, input=input, n_in=n_in,
                              n_out=n_hiddens[0],
                              activation=hid_activations[0])
        h_layers = [("hiddenLayer0", layer0)]
        for i in range(1, len(n_hiddens)):
            h_layers.append(
                ("hiddenLayer%d" % i,
                 NetworkLayer(rng=rng,
                              input=h_layers[i - 1][1].output,
                              n_in=n_hiddens[i - 1],
                              n_out=n_hiddens[i],
                              activation=hid_activations[i])))
        self.__dict__.update(dict(h_layers))

        # The output layer gets as input the hidden units of the last
        # hidden layer
        self.outputLayer = MDNoutputLayer(rng=rng,
                                          input=h_layers[-1][1].output,
                                          n_in=n_hiddens[-1],
                                          n_out=n_out,
                                          n_components=n_components)

        # square of L2 norm; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = ((self.outputLayer.W_sigma ** 2).sum() +
                       (self.outputLayer.W_mixing ** 2).sum())
        for i in range(len(n_hiddens)):
            self.L2_sqr += (self.__dict__["hiddenLayer%d" % i].W ** 2).sum()

        # the parameters of the model are the parameters of all the layers
        # it is made out of
        params = self.outputLayer.params
        for layer in h_layers:
            params.extend(layer[1].params)
        self.params = params

    def set_symbolic_input(self, input):
        """Bind a symbolic variable to the input of the network layer,
        so it can be specified at training time."""
        self.input = input

    def train(self, y, training_loss, learning_rate,
              n_epochs, train_x, train_y, valid_x, valid_y, batch_size):
        """Train the MLP using SGD"""

        index = T.iscalar()  # index to a [mini]batch
        lr = T.scalar()      # symbolic learning rate

        gparams = []
        for param in self.params:
            gparam = T.grad(training_loss, param)
            gparams.append(gparam)

        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append(
                (param, param - gparam * T.cast(lr, dtype=theano.config.floatX)))

        train_model = theano.function(
            inputs=[index, lr],
            outputs=[training_loss],
            updates=updates,
            givens={
                self.input: train_x[index * batch_size:(index + 1) * batch_size],
                y: train_y[index * batch_size:(index + 1) * batch_size],
            })

        validate_model = theano.function(
            inputs=[index],
            outputs=NLL(sigma=self.outputLayer.sigma,
                        mixing=self.outputLayer.mixing,
                        y=y),
            givens={
                self.input: valid_x[index * batch_size:(index + 1) * batch_size],
                y: valid_y[index * batch_size:(index + 1) * batch_size],
            })

        # compute number of minibatches for training and validation
        n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
        n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size

        validate_MSE = theano.function(
            inputs=[index],
            outputs=MSE(self.samples(), y=y),
            givens={
                self.input: valid_x[index * batch_size:(index + 1) * batch_size],
                y: valid_y[index * batch_size:(index + 1) * batch_size],
            })

        print "training..."
        start_time = time.clock()
        epoch = 0
        total_training_costs = []
        total_validation_costs = []
        total_validation_MSE = []
        lr_step = learning_rate / ((train_x.get_value().shape[0] * 1.0 /
                                    batch_size) * (n_epochs - 30))
        lr_val = learning_rate

        while epoch < n_epochs:
            epoch = epoch + 1
            epoch_training_costs = []
            for minibatch_index in xrange(n_train_batches):
                # linear annealing after 40 epochs...
                if epoch > 40:
                    lr_val = lr_val - lr_step
                else:
                    lr_val = learning_rate
                loss_value = train_model(minibatch_index, lr_val)
                epoch_training_costs.append(loss_value)
                if np.isnan(loss_value):
                    print "got NaN in NLL"
                    sys.exit(1)

            this_training_cost = np.mean(epoch_training_costs)
            this_validation_cost = np.mean([validate_model(i)
                                            for i in xrange(n_valid_batches)])
            this_validation_MSE = np.mean([validate_MSE(i)
                                           for i in xrange(n_valid_batches)])
            total_training_costs.append(this_training_cost)
            total_validation_costs.append(this_validation_cost)
            total_validation_MSE.append(this_validation_MSE)
            print "epoch %i, training NLL %f, validation NLL %f, MSE %f" % (
                epoch, this_training_cost, this_validation_cost,
                this_validation_MSE)

        end_time = time.clock()
        print "Training took %.2f minutes..." % ((end_time - start_time) / 60.)

        # return losses and parameters..
        return total_training_costs, total_validation_costs, total_validation_MSE

    def samples(self):
        component = self.srng.multinomial(pvals=self.outputLayer.mixing)
        component_std = T.sum(self.outputLayer.sigma * component,
                              axis=1, keepdims=True)
        samples = self.srng.normal(std=component_std)
        return samples

    def save_model(self, filename="MLP.save", output_folder="output_folder"):
        """Pickle the parameters in a file for later usage"""
        storage_file = open(os.path.join(output_folder, filename), "wb")
        cPickle.dump(self, storage_file, protocol=cPickle.HIGHEST_PROTOCOL)
        storage_file.close()

    @staticmethod
    def load_model(filename="MLP.save", output_folder="output_folder"):
        """Load pickled parameters from a file"""
        storage_file = open(os.path.join(output_folder, filename), "rb")
        model = cPickle.load(storage_file)
        storage_file.close()
        return model
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred, task):
    if connect_h_to_o:
        if separate_last_hidden:
            dedicated_last_h = h[-1][:, :, task * dedicated_last_h_size:
                                     (task + 1) * dedicated_last_h_size]
            shared_last_h = h[-1][:, :, len(game_tasks) * dedicated_last_h_size:]
            shared_last_h_size = (hidden_size -
                                  len(game_tasks) * dedicated_last_h_size)
            hiddens = T.concatenate(
                [hidden for hidden in
                 h[0:-1] + [dedicated_last_h] + [shared_last_h]],
                axis=2)
            hidden_out_size = (
                hidden_size * (len(h) - specialized_layer_num - 1) +
                specialized_hidden_size * specialized_layer_num +
                dedicated_last_h_size + shared_last_h_size)
        else:
            hiddens = T.concatenate([hidden for hidden in h], axis=2)
            hidden_out_size = (
                hidden_size * (len(h) - specialized_layer_num) +
                specialized_hidden_size * specialized_layer_num)
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred),
                       input_dim=hidden_out_size,
                       output_dim=out_size * components_size)
    sigma_linear = Linear(name='sigma_linear' + str(pred),
                          input_dim=hidden_out_size,
                          output_dim=components_size)
    mixing_linear = Linear(name='mixing_linear' + str(pred),
                           input_dim=hidden_out_size,
                           output_dim=components_size)
    initialize([mu_linear, sigma_linear, mixing_linear])

    mu = mu_linear.apply(hiddens)
    mu = mu.reshape((mu.shape[0], mu.shape[1], out_size, components_size))
    sigma = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma)
    mixing = mixing_linear.apply(hiddens)

    # apply softmax to mixing
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)

    # calculate cost
    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 2, 'x') - mu)**2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)

    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent

    # multiply by the task (0 if the cost is not related to this task,
    # 1 otherwise)
    if task_specialized:
        task_index = in_size - len(game_tasks) + task
        log_gauss = T.mul(log_gauss,
                          T.sub(x[:, :, task_index:task_index + 1], 1))

    # mean over the batch, mean over the sequence
    cost = -T.mean(log_gauss, axis=1).mean()

    # sampling
    srng = RandomStreams(seed=seed)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'
    return linear_output, cost
class SetRBM(object):
    """The Restricted Boltzmann Machine learning algorithm."""

    def __init__(self, n_visibles, n_hiddens, n_classes,
                 W=None, U=None, b=None, c=None, d=None,
                 learning_rate=0.1, K=1):
        self.n_visibles = n_visibles
        self.n_hiddens = n_hiddens
        self.n_classes = n_classes
        self.x = T.matrix('x')
        self.y = T.vector('y')

        if W is None:
            W_value = numpy.asarray(
                numpy.random.normal(loc=0, scale=0.01,
                                    size=(n_visibles, n_hiddens)),
                dtype=theano.config.floatX)
            W = theano.shared(value=W_value, name='W')
        if U is None:
            U_value = numpy.asarray(
                numpy.random.normal(loc=0, scale=0.01,
                                    size=(n_classes, n_hiddens)),
                dtype=theano.config.floatX)
            U = theano.shared(value=U_value, name='U')
        if b is None:
            b = theano.shared(value=numpy.zeros(n_hiddens, dtype=theano.config.floatX), name='b')
        if c is None:
            c = theano.shared(value=numpy.zeros(n_visibles, dtype=theano.config.floatX), name='c')
        if d is None:
            d = theano.shared(value=numpy.zeros(n_classes, dtype=theano.config.floatX), name='d')

        self.W = W
        self.U = U
        self.b = b
        self.c = c
        self.d = d
        self.params = [self.W, self.U, self.b, self.c, self.d]

        self.theano_rng = RandomStreams(numpy.random.randint(2**30))
        self.learning_rate = theano.shared(
            numpy.asarray(learning_rate, dtype=theano.config.floatX))
        self.K = K

        cost, updates = self.__train()
        self.train = theano.function([self.x, self.y], cost, updates=updates)
        self.trainables = map(lambda x: x, updates)
        # TODO need a way to marginalize g from y
        #self.transform = theano.function([self.x], self._mean_g(self.x).sum(0))
        self.output = theano.function([self.x], self._output(self.x))

    def _free_energy(self, x, y):
        bias_term = T.dot(y, self.d) + T.dot(x, self.c)
        softmax_x = T.log(T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0))
        hidden_term = T.nnet.softplus(T.dot(y, self.U) + softmax_x).sum()
        return -bias_term - hidden_term

    def _output(self, x):
        softmax_x = T.log(T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0))
        output = -T.nnet.softplus(self.U + softmax_x).sum(1)
        return T.argmax(T.nnet.softmax(output))

    def _softminus(self, x):
        return x - T.nnet.softplus(x)

    def _act(self, x, y):
        return self._softminus(self.b + T.dot(x, self.W)) + T.dot(y, self.U)

    def _mean_g(self, x, y):
        act = self._act(x, y)
        return (T.exp(act) / (1. + T.exp(act).sum(0)),
                1. / (1. + T.exp(act).sum(0)))

    def _mean_h(self, g, x):
        return T.maximum(g, T.nnet.sigmoid(T.dot(x, self.W) + self.b))

    def _mean_x(self, h):
        return T.dot(h, self.W.T) + self.c

    def _mean_y(self, g):
        return T.nnet.softmax(T.dot(g, self.U.T).sum(0) + self.d)

    def _sample_g(self, x, y):
        g_mean, g_zeros = self._mean_g(x, y)
        g_mean = T.concatenate((g_zeros.dimshuffle('x', 0), g_mean))
        g_sample = self.theano_rng.multinomial(
            n=1, pvals=g_mean.T, dtype=theano.config.floatX).T[1:]
        return g_sample

    def _sample_h(self, g, x):
        h_mean = self._mean_h(g, x)
        h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean,
                                            dtype=theano.config.floatX)
        return h_sample

    def _sample_x(self, h):
        x_mean = self._mean_x(h)
        x_sample = self.theano_rng.binomial(size=x_mean.shape, n=1, p=x_mean,
                                            dtype=theano.config.floatX)
        return x_sample

    def _sample_y(self, g):
        y_mean = self._mean_y(g)
        y_sample = self.theano_rng.multinomial(n=1, pvals=y_mean,
                                               dtype=theano.config.floatX)
        return y_sample

    def __train(self):
        nx_samples = self.x
        ng_samples = self._sample_g(self.x, self.y)
        for _ in range(self.K):
            nh_samples = self._sample_h(ng_samples, nx_samples)
            nx_samples = self._mean_x(nh_samples)
            ny_samples = self._sample_y(ng_samples)
            ng_samples = self._sample_g(nx_samples, ny_samples)

        cost = (T.mean(self._free_energy(self.x, self.y)) -
                T.mean(self._free_energy(nx_samples, ny_samples)))
        gparams = T.grad(cost, self.params,
                         consider_constant=[nx_samples, ny_samples])
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param - gparam * T.cast(self.learning_rate,
                                                     dtype=theano.config.floatX)
        monitoring_cost = T.nnet.binary_crossentropy(self.y, ny_samples).mean()
        return monitoring_cost, updates

    def save(self, tag=None):
        if tag is None:
            tag = ""
        else:
            tag = "_%s" % tag
        numpy.save("rbm_W%s.npy" % tag, self.W.get_value(borrow=True))
        numpy.save("rbm_U%s.npy" % tag, self.U.get_value(borrow=True))
        numpy.save("rbm_b%s.npy" % tag, self.b.get_value(borrow=True))
        numpy.save("rbm_c%s.npy" % tag, self.c.get_value(borrow=True))
        numpy.save("rbm_d%s.npy" % tag, self.d.get_value(borrow=True))
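# Minimal training-loop sketch for SetRBM above; the dataset sizes and the
# `batches` iterable are assumptions for illustration, not from the original.
rbm = SetRBM(n_visibles=784, n_hiddens=256, n_classes=10)
for epoch in range(10):
    for x_set, y_onehot in batches:        # x_set: (set_size, 784) binary rows
        cost = rbm.train(x_set, y_onehot)  # y_onehot: (10,) one-hot label vector
    rbm.save(tag="epoch%d" % epoch)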
class RBMReplSoftmax(RBM):
    def __init__(self, num_vis, num_hid, train_params, from_cache=True):
        self.input = T.matrix('input')
        self.numpy_rng = np.random.RandomState(1)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
        self.num_vis = num_vis
        self.num_hid = num_hid
        self.init_params()  # initialize input layer for standalone RBM or layer0 of DBN
        self.epoch_ratio = theano.shared(np.zeros((1), dtype=theano.config.floatX), borrow=True)
        self.need_train = True
        self.D = T.sum(self.input, axis=1)  # .dimshuffle(0,'x')
        self.params = [self.W, self.hbias, self.vbias]
        _, self.output = self.prop_up(self.input)
        self.hid_means = theano.shared(
            np.tile(np.asarray(train_params['sparse_target'],
                               dtype=theano.config.floatX), self.num_hid),
            borrow=True)
        if from_cache:
            self.restore_from_cache(train_params)
        self.watches = []
        self.watches_label = []

    def save_model(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis, self.num_hid, train_params['max_epoch'],
            train_params['sparse_target'])
        fileName = os.path.join(path, fileName)
        save_file = open(fileName, 'wb')  # this will overwrite current contents
        cPickle.dump(self.W.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.vbias.get_value(borrow=True), save_file, -1)
        cPickle.dump(self.hbias.get_value(borrow=True), save_file, -1)
        save_file.close()

    def restore_from_cache(self, train_params, path=CACHE_PATH):
        fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % (
            self.num_vis, self.num_hid, train_params['max_epoch'],
            train_params['sparse_target'])
        fileName = os.path.join(path, fileName)
        if os.path.isfile(fileName):
            fileName_p = open(fileName, 'r')
            self.W.set_value(cPickle.load(fileName_p), borrow=True)
            self.vbias.set_value(cPickle.load(fileName_p), borrow=True)
            self.hbias.set_value(cPickle.load(fileName_p), borrow=True)
            fileName_p.close()
            self.need_train = False
            print "Model file %s was found. rbm.need_train flag turned to False" % fileName
        else:
            print "Model file was not found. Need to call RBM.save_model()"

    def init_W(self):
        initial_W = np.asarray(
            0.001 * self.numpy_rng.randn(self.num_vis, self.num_hid),
            dtype=theano.config.floatX)
        self.W = theano.shared(value=initial_W, name='W', borrow=True)
        self.W_inc = theano.shared(
            value=np.zeros((self.num_vis, self.num_hid), dtype=theano.config.floatX),
            name='W_inc', borrow=True)

    def init_hbias(self):
        self.hbias = theano.shared(
            value=np.zeros(self.num_hid, dtype=theano.config.floatX),
            name='hbias', borrow=True)
        self.hbias_inc = theano.shared(
            value=np.zeros(self.num_hid, dtype=theano.config.floatX),
            name='hbias_inc', borrow=True)

    def init_vbias(self):
        self.vbias = theano.shared(
            value=np.zeros(self.num_vis, dtype=theano.config.floatX),
            name='vbias', borrow=True)
        self.vbias_inc = theano.shared(
            value=np.zeros(self.num_vis, dtype=theano.config.floatX),
            name='vbias_inc', borrow=True)

    def init_params(self):
        self.init_W()
        self.init_vbias()
        self.init_hbias()

    def prop_up(self, vis, D=None):
        if D is None:
            D = self.D
        pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias)
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    def prop_down(self, hid):
        pre_softmax_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_softmax_activation, T.nnet.softmax(pre_softmax_activation)]

    def free_energy(self, v_sample):
        D = T.sum(v_sample, axis=1)
        wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias)
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    def sample_v_given_h(self, h_sample, D=None):
        if D is None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)
        v_mean = v_mean / T.sum(v_mean, axis=1).dimshuffle(0, 'x')
        v_samples, updates = theano.scan(fn=self.multinom_sampler,
                                         non_sequences=[v_mean, D],
                                         n_steps=1)
        self.updates = updates
        #v_sample = T.mean(v_samples, axis=0)
        v_sample = v_samples[-1]
        return [pre_softmax_v, v_mean, v_sample]

    def multinom_sampler(self, probs, D):
        v_sample = self.theano_rng.multinomial(n=D, pvals=probs,
                                               dtype=theano.config.floatX)
        return v_sample

    def sample_v_given_h_mf(self, h_sample, D=None):
        if D is None:
            D = self.D
        pre_softmax_v, v_mean = self.prop_down(h_sample)
        v_sample = D.dimshuffle(0, 'x') * v_mean
        return [pre_softmax_v, v_mean, v_sample]

    def sample_h_given_v(self, v_sample, D=None):
        if D is None:
            D = self.D
        pre_sigmoid_h, h_mean = self.prop_up(v_sample, D)
        h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean,
                                            dtype=theano.config.floatX)
        return [pre_sigmoid_h, h_mean, h_sample]

    def gibbs_hvh(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [pre_softmax_v1, v1_mean, v1_sample,
                pre_sigmoid_h1, h1_mean, h1_sample]

    def gibbs_hvh_mf(self, h0_sample):
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [pre_softmax_v1, v1_mean, v1_sample,
                pre_sigmoid_h1, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample, D)
        return [pre_sigmoid_h1, h1_mean, h1_sample,
                pre_softmax_v1, v1_mean, v1_sample]

    def gibbs_vhv_mf(self, v0_sample, D):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D)
        pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h1_sample, D)
        return [pre_sigmoid_h1, h1_mean, h1_sample,
                pre_softmax_v1, v1_mean, v1_sample]

    def add_watch(self, w, name):
        self.watches.append(w)
        self.watches_label.append(name)

    def clean_wacthes(self):
        self.watches = []
        self.watches_label = []

    def get_cost_updates(self, train_params):
        l_rate = T.cast(train_params['learning_rate'], dtype=theano.config.floatX)
        weight_decay = T.cast(train_params['weight_decay'], dtype=theano.config.floatX)
        momentum = T.cast(train_params['momentum'], dtype=theano.config.floatX)
        init_momentum = T.cast(train_params['init_momentum'], dtype=theano.config.floatX)
        moment_start = train_params['moment_start']
        batch_size = T.cast(train_params['batch_size'], dtype=theano.config.floatX)
        cd_steps = train_params['cd_steps']
        persistent = train_params['persistent']
        persistent_on = train_params['persistent_on']
        sparse_damping = T.cast(train_params['sparse_damping'], dtype=theano.config.floatX)
        sparse_cost = T.cast(train_params['sparse_cost'], dtype=theano.config.floatX)
        sparse_target = T.cast(train_params['sparse_target'], dtype=theano.config.floatX)

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)
        self.add_watch(self.input, "vis_s")
        self.add_watch(ph_mean, "hid_m")

        if persistent_on:
            if T.eq(T.sum(T.sum(persistent, axis=1)), 0):
                chain_start = ph_sample
            else:
                chain_start = persistent
        else:
            chain_start = ph_sample

        if train_params['mean_field']:
            gibbs_hvh_fun = self.gibbs_hvh_mf
        else:
            gibbs_hvh_fun = self.gibbs_hvh

        [pre_softmax_nvs, nv_means, nv_samples,
         pre_sigmoid_nhs, nh_means, nh_samples], updates = \
            theano.scan(gibbs_hvh_fun,
                        outputs_info=[None, None, None, None, None, chain_start],
                        n_steps=cd_steps)

        vis_samp_fant = nv_samples[-1]
        hid_probs_fant = nh_means[-1]
        self.add_watch(vis_samp_fant, "neg_vis_s")
        self.add_watch(hid_probs_fant, "neg_hid_m")

        cur_momentum = T.switch(T.lt(self.epoch_ratio[0], moment_start),
                                init_momentum, momentum)

        # sparsity stuff
        hid_means = (sparse_damping * self.hid_means +
                     (1 - sparse_damping) * T.sum(ph_mean, axis=0) / batch_size)
        sparse_grads = sparse_cost * (
            T.tile(hid_means.dimshuffle('x', 0),
                   (train_params['batch_size'], 1)) - sparse_target)
        self.add_watch(hid_means, "hid_means")
        self.add_watch(sparse_grads, "sparse_grads")

        # updates
        W_inc = (T.dot(self.input.T, ph_mean) -
                 T.dot(vis_samp_fant.T, hid_probs_fant) -
                 T.dot(self.input.T, sparse_grads)) / batch_size - self.W * weight_decay
        hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant, axis=0) -
                     T.sum(sparse_grads, axis=0)) / batch_size
        # W_inc = (T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant)) / batch_size - self.W * weight_decay
        # hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant, axis=0)) / batch_size
        vbias_inc = (T.sum(self.input, axis=0) - T.sum(vis_samp_fant, axis=0)) / batch_size

        W_inc_rate = (self.W_inc * cur_momentum + W_inc) * l_rate
        hbias_inc_rate = (self.hbias_inc * cur_momentum + hbias_inc) * l_rate
        vbias_inc_rate = (self.vbias_inc * cur_momentum + vbias_inc) * l_rate

        updates[self.W] = self.W + W_inc_rate
        updates[self.hbias] = self.hbias + hbias_inc_rate
        updates[self.vbias] = self.vbias + vbias_inc_rate
        updates[self.W_inc] = W_inc
        updates[self.hbias_inc] = hbias_inc
        updates[self.vbias_inc] = vbias_inc
        updates[self.hid_means] = hid_means

        self.add_watch(T.as_tensor_variable(self.W), "W")
        # self.add_watch(T.as_tensor_variable(self.hbias), "hbias")
        # self.add_watch(T.as_tensor_variable(self.vbias), "vbias")
        self.add_watch(W_inc_rate, "W_inc")
        # self.add_watch(hbias_inc_rate, "hbias_inc")
        # self.add_watch(vbias_inc_rate, "vbias_inc")

        current_free_energy = T.mean(self.free_energy(self.input))
        self.add_watch(T.mean(self.free_energy(self.input)), 'free_en')

        if persistent_on:
            updates[persistent] = nh_samples[-1]
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(vis_samp_fant)
        self.add_watch(monitoring_cost, "cost")
        return monitoring_cost, current_free_energy, T.mean(W_inc_rate), updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')
        xi = T.round(self.input)
        fe_xi = self.free_energy(xi)
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])
        fe_xi_flip = self.free_energy(xi_flip)
        cost = T.mean(self.num_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))
        updates[bit_i_idx] = (bit_i_idx + 1) % self.num_vis
        return cost

    def get_reconstruction_cost(self, vis_sample, vis_source=None, D=None):
        if not vis_source:
            return T.sum((T.sum(T.sqr(self.input - vis_sample), axis=1)) / self.D)
        return T.sum((T.sum(T.sqr(vis_source - vis_sample), axis=1)) / D)
        outputs_info=[None],
        non_sequences=[init_multi_samp, Tweights, nsteps],
        n_steps=ntest)
sample_metropolis = theano.function([Tweights, nsteps], Tm_samps,
                                    allow_input_downcast=True)

## setting up Theano sampling =======================================
nummat = np.repeat(np.reshape(np.arange(npcl), (npcl, 1)), npcl, axis=1)
idx_mat = theano.shared(nummat.T)
Tprobs = T.fvector()
t_samp = rng.multinomial(size=Tprobs.shape, pvals=Tprobs)
idxs = T.cast(T.sum(t_samp * idx_mat, axis=1), 'int64')
sample_theano = theano.function([Tprobs], idxs, allow_input_downcast=True)

## Speed test
weights = np.random.rand(npcl)
probs = weights / np.sum(weights)
m_samps = np.zeros((ntest, npcl))
t_samps = np.zeros((ntest, npcl))
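# Numpy reference for the Theano categorical sampler above (a sketch, not part
# of the original): each multinomial draw is a one-hot row, so multiplying by
# the index matrix and summing (here: argmax) recovers the sampled indices.
import numpy as np

def sample_indices(probs, n_draws, rng=np.random):
    one_hot = rng.multinomial(1, probs, size=n_draws)  # (n_draws, npcl)
    return one_hot.argmax(axis=1)                      # sampled particle indices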
class SRNN(Model):
    def __init__(self,
                 name,  # a string for identifying the model
                 numvis,
                 numhid,
                 numframes,
                 output_type='real',
                 cheating_level=.0,  # cheating by looking at x_t (instead of x_tm1)
                 numpy_rng=None,
                 theano_rng=None):
        super(SRNN, self).__init__(name=name)

        # store arguments
        self.numvis = numvis
        self.numhid = numhid
        self.numframes = numframes
        self.output_type = output_type
        self.selectionthreshold = 0.0
        self.cheating_level = theano.shared(np.float32(cheating_level))

        if not numpy_rng:
            self.numpy_rng = np.random.RandomState(1)
        else:
            self.numpy_rng = numpy_rng
        if not theano_rng:
            self.theano_rng = RandomStreams(1)
        else:
            self.theano_rng = theano_rng

        # create input var
        self.inputs = T.matrix(name='inputs')

        # set up params
        self.whh = theano.shared(
            value=np.eye(self.numhid).astype(theano.config.floatX), name='whh')
        self.whx = theano.shared(
            value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numhid, self.numvis)).astype(theano.config.floatX),
            name='whx')
        self.wxh = theano.shared(
            value=self.numpy_rng.uniform(
                low=-0.01, high=0.01,
                size=(self.numvis, self.numhid)).astype(theano.config.floatX),
            name='wxh')
        self.bx = theano.shared(
            value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX),
            name='bx')
        self.params = [self.whh, self.whx, self.wxh, self.bx]

        self._batchsize = self.inputs.shape[0]
        # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
        self._input_frames = self.inputs.reshape(
            (self._batchsize, self.inputs.shape[1] // self.numvis,
             self.numvis)).transpose(1, 0, 2)

        # one-step prediction, used by the sampling function
        self.hids_0 = T.zeros((self._batchsize, self.numhid))
        self.hids_1 = T.dot(self.hids_0, self.whh) + T.dot(self._input_frames[0], self.wxh)
        self.hids_1 = self.hids_1 * (self.hids_1 > self.selectionthreshold)
        self.x_pred_1 = T.dot(self.hids_1, self.whx) + self.bx

        def step(x_gt_t,  # cheating by looking at the current time step input
                 x_tm1, hids_tm1):
            pre_hids_t = T.dot(hids_tm1, self.whh) + T.dot(
                self.cheating_level * x_gt_t +
                (1. - self.cheating_level) * x_tm1, self.wxh)
            hids_t = pre_hids_t * (pre_hids_t > self.selectionthreshold)
            x_pred_t = T.dot(hids_t, self.whx) + self.bx
            return x_pred_t, hids_t

        (self._predictions, self.hids), self.updates = theano.scan(
            fn=step,
            sequences=self._input_frames,
            outputs_info=[self._input_frames[0], self.hids_0])

        # set up output prediction
        if self.output_type == 'real':
            self._prediction = self._predictions[:, :, :self.numvis]
        elif self.output_type == 'binary':
            self._prediction = sigmoid(self._predictions[:, :, :self.numvis])
        elif self.output_type == 'softmax':
            # softmax doesn't support 3d tensors: reshape batch and time axis
            # together, apply softmax and reshape back to 3d tensor
            self._prediction = T.nnet.softmax(
                self._predictions[:, :, :self.numvis].reshape(
                    (self._predictions.shape[0] * self._predictions.shape[1],
                     self.numvis))).reshape(
                (self._predictions.shape[0], self._predictions.shape[1],
                 self.numvis))
        else:
            raise ValueError('unsupported output_type')

        # set cost
        self._prediction_for_training = self._prediction[:self.numframes - 1]
        if self.output_type == 'real':
            self._cost = T.mean((self._prediction_for_training -
                                 self._input_frames[1:self.numframes])**2)
            self._cost_varlen = T.mean(
                (self._prediction - self._input_frames[1:])**2)
        elif self.output_type == 'binary':
            self._cost = -T.mean(
                self._input_frames[1:self.numframes] *
                T.log(self._prediction_for_training) +
                (1.0 - self._input_frames[1:self.numframes]) *
                T.log(1.0 - self._prediction_for_training))
            self._cost_varlen = -T.mean(
                self._input_frames[1:] * T.log(self._prediction) +
                (1.0 - self._input_frames[1:]) *
                T.log(1.0 - self._prediction))
        elif self.output_type == 'softmax':
            self._cost = -T.mean(
                T.log(self._prediction_for_training) *
                self._input_frames[1:self.numframes])
            self._cost_varlen = -T.mean(
                T.log(self._prediction) * self._input_frames[1:])

        # set gradients
        self._grads = T.grad(self._cost, self.params)

        # theano functions for computing cost and grad
        self.cost = theano.function([self.inputs], self._cost,
                                    updates=self.updates)
        self.grads = theano.function([self.inputs], self._grads,
                                     updates=self.updates)

        # another set of variables: give some time steps of characters and
        # free the model to predict all the rest.
        self.inputs_var = T.fmatrix('inputs_var')
        self.nsteps = T.lscalar('nsteps')
        givens = {}
        givens[self.inputs] = T.concatenate(
            (self.inputs_var[:, :self.numvis],
             T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))),
            axis=1)
        self.predict = theano.function(
            [self.inputs_var,
             theano.Param(self.nsteps, default=self.numframes - 4)],
            self._prediction.transpose(1, 0, 2).reshape(
                (self.inputs_var.shape[0], self.nsteps * self.numvis)),
            updates=self.updates,
            givens=givens)

    def grad(self, x):
        def get_cudandarray_value(x):
            if type(x) == theano.sandbox.cuda.CudaNdarray:
                return np.array(x.__array__()).flatten()
            else:
                return x.flatten()
        return np.concatenate(
            [get_cudandarray_value(g) for g in self.grads(x)])

    def sample(self, numcases=1, numframes=10, temperature=1.0):
        assert self.output_type == 'softmax'
        next_prediction_and_state = theano.function(
            [self._input_frames, self.hids_0],
            [self.theano_rng.multinomial(
                pvals=T.nnet.softmax(self.x_pred_1 / temperature)),
             self.hids_1])
        preds = np.zeros((numcases, numframes, self.numvis), dtype="float32")
        preds[:, 0, :] = self.numpy_rng.multinomial(
            numcases, pvals=np.ones(self.numvis) / np.float(self.numvis))
        hids = np.zeros((numcases, self.numhid), dtype="float32")
        for t in range(1, numframes):
            nextpredandstate = next_prediction_and_state(
                preds[:, [t - 1], :], hids)
            hids = nextpredandstate[1]
            preds[:, t, :] = nextpredandstate[0]
        return preds
class CharacterRNN(ParameterModel):
    def __init__(self, name, n_input, n_output, n_hidden=10, n_layers=2):
        super(CharacterRNN, self).__init__(name)
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.n_input = n_input
        self.n_output = n_output

        self.rng = RandomStreams(seed=1337)

        self.lstm = LSTM('%s-charrnn' % name, self.n_input,
                         n_hidden=self.n_hidden,
                         n_layers=self.n_layers,
                         rng=self.rng)
        self.output = Softmax('%s-softmax' % name, n_hidden, self.n_output)

    def save_parameters(self, location):
        state = {
            'n_hidden': self.n_hidden,
            'n_layers': self.n_layers,
            'lstm': self.lstm.state(),
            'output': self.output.state()
        }
        with open(location, 'wb') as fp:
            pickle.dump(state, fp)

    def load_parameters(self, location):
        with open(location, 'rb') as fp:
            state = pickle.load(fp)
            self.n_hidden = state['n_hidden']
            self.n_layers = state['n_layers']
            self.lstm.load(state['lstm'])
            self.output.load(state['output'])

    @theanify(T.tensor3('X'), T.tensor3('state'), T.tensor3('y'))
    def cost(self, X, state, y):
        (_, state, ypred), updates = self.forward(X, state)
        S, N, V = y.shape
        y = y.reshape((S * N, V))
        ypred = ypred.reshape((S * N, V))
        return (T.nnet.categorical_crossentropy(ypred, y).mean(), state), updates

    def forward(self, X, state):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.output.n_output

        def step(input, previous_hidden, previous_state, previous_output):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden,
                                                   previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return lstm_hidden, state, final_output

        hidden = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (encoder_output, encoder_state, softmax_output), updates = theano.scan(
            step,
            sequences=[X],
            outputs_info=[
                hidden,
                state,
                T.alloc(np.asarray(0).astype(theano.config.floatX), N, O),
            ],
            n_steps=S)
        return (encoder_output, encoder_state, softmax_output), updates

    @theanify(T.fvector('start_token'), T.iscalar('length'),
              T.fscalar('temperature'), returns_updates=True)
    def generate(self, start_token, length, temperature):
        start_token = start_token[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden,
                                                   previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output,
                                          dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(
            step,
            outputs_info=[
                start_token,
                hidden,
                state,
            ],
            non_sequences=[temperature],
            n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'),
              T.fscalar('temperature'), returns_updates=True)
    def generate_with_concat(self, start_token, concat, length, temperature):
        start_token = start_token[:, np.newaxis].T
        concat = concat[:, np.newaxis].T
        N = 1
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature, concat):
            lstm_hidden, state = self.lstm.forward(
                T.concatenate([input, concat], axis=1),
                previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output,
                                          dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(
            step,
            outputs_info=[
                start_token,
                hidden,
                state,
            ],
            non_sequences=[temperature, concat],
            n_steps=length)
        return softmax_output[:, 0, :], updates

    @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'),
              T.iscalar('num_examples'), T.fscalar('temperature'),
              returns_updates=True)
    def generate_examples(self, start_token, concat, length, num_examples,
                          temperature):
        start_token = T.tile(start_token[:, np.newaxis].T, (num_examples, 1))
        concat = T.tile(concat[:, np.newaxis].T, (num_examples, 1))
        N = num_examples
        H = self.lstm.n_hidden
        L = self.lstm.n_layers

        def step(input, previous_hidden, previous_state, temperature, concat):
            lstm_hidden, state = self.lstm.forward(
                T.concatenate([input, concat], axis=1),
                previous_hidden, previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], temperature)
            sample = self.rng.multinomial(n=1, size=(num_examples,),
                                          pvals=final_output,
                                          dtype=theano.config.floatX)
            return sample, lstm_hidden, state

        hidden = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        state = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)

        (softmax_output, _, _), updates = theano.scan(
            step,
            outputs_info=[
                start_token,
                hidden,
                state,
            ],
            non_sequences=[temperature, concat],
            n_steps=length)
        return softmax_output[:, :, :], updates

    @theanify(T.tensor3('X'), returns_updates=True)
    def log_probability(self, X):
        S, N, D = X.shape
        H = self.lstm.n_hidden
        L = self.lstm.n_layers
        O = self.n_output

        def step(input, log_prob, previous_hidden, previous_state):
            lstm_hidden, state = self.lstm.forward(input, previous_hidden,
                                                   previous_state)
            final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0)
            return final_output, lstm_hidden, state

        hidden = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1)
        start_log = T.alloc(np.array(0).astype(theano.config.floatX), N, O)
        state = T.unbroadcast(
            T.alloc(np.array(0).astype(theano.config.floatX), N, L, H))

        (log_prob, _, _), updates = theano.scan(
            step,
            sequences=[X],
            outputs_info=[
                start_log,
                hidden,
                state,
            ],
            n_steps=S)
        return log_prob, updates

    def get_parameters(self):
        return self.lstm.get_parameters() + self.output.get_parameters()
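# Hypothetical sampling driver for CharacterRNN.generate above; vocab_size,
# char_to_index, index_to_char, and the direct-call semantics of the (unseen)
# @theanify decorator are all assumptions for illustration.
import numpy as np

model = CharacterRNN('charrnn', vocab_size, vocab_size)
start = np.zeros(vocab_size, dtype='float32')   # one-hot start token
start[char_to_index['a']] = 1.0
onehots = model.generate(start, np.int32(100), np.float32(1.0))  # (length, vocab)
text = ''.join(index_to_char[i] for i in onehots.argmax(axis=1))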
class SetRBM(object): """ The Restricted Boltzmann Machine learning algorithm. """
def __init__(self, n_visibles, n_hiddens, n_classes, W=None, U=None, b=None, c=None, d=None, learning_rate=0.1, K=1): self.n_visibles = n_visibles self.n_hiddens = n_hiddens self.n_classes = n_classes self.x = T.matrix('x') self.y = T.vector('y') if W is None: W_value = numpy.asarray(numpy.random.normal(loc=0, scale=0.01, size=(n_visibles, n_hiddens)), dtype=theano.config.floatX) W = theano.shared(value=W_value, name='W') if U is None: U_value = numpy.asarray(numpy.random.normal(loc=0, scale=0.01, size=(n_classes, n_hiddens)), dtype=theano.config.floatX) U = theano.shared(value=U_value, name='U') if b is None: b = theano.shared(value=numpy.zeros(n_hiddens, dtype=theano.config.floatX), name='b') if c is None: c = theano.shared(value=numpy.zeros(n_visibles, dtype=theano.config.floatX), name='c') if d is None: d = theano.shared(value=numpy.zeros(n_classes, dtype=theano.config.floatX), name='d') self.W = W self.U = U self.b = b self.c = c self.d = d self.params = [self.W, self.U, self.b, self.c, self.d] self.theano_rng = RandomStreams(numpy.random.randint(2**30)) self.learning_rate = theano.shared( numpy.asarray(learning_rate, dtype=theano.config.floatX)) self.K = K cost, updates = self.__train() self.train = theano.function([self.x, self.y], cost, updates=updates) self.trainables = list(updates)
# TODO: need a way to marginalize g over y
# self.transform = theano.function([self.x], self._mean_g(self.x).sum(0))
self.output = theano.function([self.x], self._output(self.x))
def _free_energy(self, x, y): bias_term = T.dot(y, self.d) + T.dot(x, self.c) softmax_x = T.log( T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0)) hidden_term = T.nnet.softplus(T.dot(y, self.U) + softmax_x).sum() return -bias_term - hidden_term
def _output(self, x): softmax_x = T.log( T.exp(self._softminus(T.dot(x, self.W) + self.b)).sum(0)) output = -T.nnet.softplus(self.U + softmax_x).sum(1) return T.argmax(T.nnet.softmax(output))
def _softminus(self, x): return x - T.nnet.softplus(x)
def _act(self, x, y): return self._softminus(self.b + T.dot(x, self.W)) + T.dot(y, self.U)
def _mean_g(self, x, y): act = self._act(x, y) return T.exp(act) / (1. + T.exp(act).sum(0)), 1. / (1. + T.exp(act).sum(0))
def _mean_h(self, g, x): return T.maximum(g, T.nnet.sigmoid(T.dot(x, self.W) + self.b))
def _mean_x(self, h): return T.dot(h, self.W.T) + self.c
def _mean_y(self, g): return T.nnet.softmax(T.dot(g, self.U.T).sum(0) + self.d)
def _sample_g(self, x, y): g_mean, g_zeros = self._mean_g(x, y) g_mean = T.concatenate((g_zeros.dimshuffle('x', 0), g_mean)) g_sample = self.theano_rng.multinomial( n=1, pvals=g_mean.T, dtype=theano.config.floatX).T[1:] return g_sample
def _sample_h(self, g, x): h_mean = self._mean_h(g, x) h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean, dtype=theano.config.floatX) return h_sample
def _sample_x(self, h): x_mean = self._mean_x(h) x_sample = self.theano_rng.binomial(size=x_mean.shape, n=1, p=x_mean, dtype=theano.config.floatX) return x_sample
def _sample_y(self, g): y_mean = self._mean_y(g) y_sample = self.theano_rng.multinomial(n=1, pvals=y_mean, dtype=theano.config.floatX) return y_sample
def __train(self): nx_samples = self.x ng_samples = self._sample_g(self.x, self.y) for _ in range(self.K): nh_samples = self._sample_h(ng_samples, nx_samples) nx_samples = self._mean_x(nh_samples) ny_samples = self._sample_y(ng_samples) ng_samples = self._sample_g(nx_samples, ny_samples) cost = T.mean(self._free_energy(self.x, self.y)) - T.mean(self._free_energy(nx_samples, ny_samples)) gparams = T.grad(cost, self.params, consider_constant=[nx_samples, ny_samples]) updates = {} for gparam, param in zip(gparams, self.params): updates[param] = param - gparam * T.cast( self.learning_rate, dtype=theano.config.floatX) monitoring_cost = T.nnet.binary_crossentropy(ny_samples, self.y).mean() return monitoring_cost, updates
def save(self, tag=None): if tag is None: tag = "" else: tag = "_%s" % tag numpy.save("rbm_W%s.npy" % tag, self.W.get_value(borrow=True)) numpy.save("rbm_U%s.npy" % tag, self.U.get_value(borrow=True)) numpy.save("rbm_b%s.npy" % tag, self.b.get_value(borrow=True)) numpy.save("rbm_c%s.npy" % tag, self.c.get_value(borrow=True)) numpy.save("rbm_d%s.npy" % tag, self.d.get_value(borrow=True))
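# Hedged numpy illustration of the trick in _sample_g() above: a "no unit
# fires" outcome is stacked in front of the per-unit probabilities, one
# (K+1)-way multinomial draw is taken per column, and the extra outcome is
# sliced off again, so every sample has at most one active group unit.
import numpy as np

rng = np.random.RandomState(0)
g_mean = np.array([[0.5, 0.2],
                   [0.2, 0.3]])                  # per-unit probabilities
g_zeros = 1.0 - g_mean.sum(axis=0)               # leftover "none" probability
pvals = np.vstack([g_zeros[None, :], g_mean]).T  # rows now sum to one
draws = np.array([rng.multinomial(1, p) for p in pvals]).T
g_sample = draws[1:]                             # drop the "none" row
print g_sample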
class RBMReplSoftmax(RBM):
def __init__(self, num_vis, num_hid, train_params, from_cache=True): self.input = T.matrix("input") self.numpy_rng = np.random.RandomState(1) self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) self.num_vis = num_vis self.num_hid = num_hid self.init_params()
# initialize input layer for standalone RBM or layer0 of DBN
self.epoch_ratio = theano.shared(np.zeros((1), dtype=theano.config.floatX), borrow=True) self.need_train = True self.D = T.sum(self.input, axis=1)  # .dimshuffle(0,'x')
self.params = [self.W, self.hbias, self.vbias] _, self.output = self.prop_up(self.input) self.hid_means = theano.shared( np.tile(np.asarray(train_params["sparse_target"], dtype=theano.config.floatX), self.num_hid), borrow=True ) if from_cache: self.restore_from_cache(train_params) self.watches = [] self.watches_label = []
def save_model(self, train_params, path=CACHE_PATH): fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % ( self.num_vis, self.num_hid, train_params["max_epoch"], train_params["sparse_target"], ) fileName = os.path.join(path, fileName) save_file = open(fileName, "wb")  # this will overwrite current contents
cPickle.dump(self.W.get_value(borrow=True), save_file, -1) cPickle.dump(self.vbias.get_value(borrow=True), save_file, -1) cPickle.dump(self.hbias.get_value(borrow=True), save_file, -1) save_file.close()
def restore_from_cache(self, train_params, path=CACHE_PATH): fileName = "rbm_rs_%s_%s_ep%s_sp%s.model" % ( self.num_vis, self.num_hid, train_params["max_epoch"], train_params["sparse_target"], ) fileName = os.path.join(path, fileName) if os.path.isfile(fileName): fileName_p = open(fileName, "rb") self.W.set_value(cPickle.load(fileName_p), borrow=True) self.vbias.set_value(cPickle.load(fileName_p), borrow=True) self.hbias.set_value(cPickle.load(fileName_p), borrow=True) fileName_p.close() self.need_train = False print "Model file %s was found. rbm.need_train flag set to False" % fileName else: print "Model file was not found. 
Need to call RBM.save_model()" def init_W(self): initial_W = np.asarray(0.001 * self.numpy_rng.randn(self.num_vis, self.num_hid), dtype=theano.config.floatX) self.W = theano.shared(value=initial_W, name="W", borrow=True) self.W_inc = theano.shared( value=np.zeros((self.num_vis, self.num_hid), dtype=theano.config.floatX), name="W_inc", borrow=True ) def init_hbias(self): self.hbias = theano.shared(value=np.zeros(self.num_hid, dtype=theano.config.floatX), name="hbias", borrow=True) self.hbias_inc = theano.shared( value=np.zeros(self.num_hid, dtype=theano.config.floatX), name="hbias_inc", borrow=True ) def init_vbias(self): self.vbias = theano.shared(value=np.zeros(self.num_vis, dtype=theano.config.floatX), name="vbias", borrow=True) self.vbias_inc = theano.shared( value=np.zeros(self.num_vis, dtype=theano.config.floatX), name="vbias_inc", borrow=True ) def init_params(self): self.init_W() self.init_vbias() self.init_hbias() def prop_up(self, vis, D=None): if D == None: D = self.D pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias) return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] def prop_down(self, hid): pre_softmax_activation = T.dot(hid, self.W.T) + self.vbias return [pre_softmax_activation, T.nnet.softmax(pre_softmax_activation)] def free_energy(self, v_sample): D = T.sum(v_sample, axis=1) wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias) vbias_term = T.dot(v_sample, self.vbias) hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) return -hidden_term - vbias_term def sample_v_given_h(self, h_sample, D=None): if D == None: D = self.D pre_softmax_v, v_mean = self.prop_down(h_sample) v_mean = v_mean / T.sum(v_mean, axis=1).dimshuffle(0, "x") v_samples, updates = theano.scan(fn=self.multinom_sampler, non_sequences=[v_mean, D], n_steps=1) self.updates = updates # v_sample = T.mean(v_samples, axis=0) v_sample = v_samples[-1] return [pre_softmax_v, v_mean, v_sample] def multinom_sampler(self, probs, D): v_sample = self.theano_rng.multinomial(n=D, pvals=probs, dtype=theano.config.floatX) return v_sample def sample_v_given_h_mf(self, h_sample, D=None): if D == None: D = self.D pre_softmax_v, v_mean = self.prop_down(h_sample) v_sample = D.dimshuffle(0, "x") * v_mean return [pre_softmax_v, v_mean, v_sample] def sample_h_given_v(self, v_sample, D=None): if D == None: D = self.D pre_sigmoid_h, h_mean = self.prop_up(v_sample, D) h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean, dtype=theano.config.floatX) return [pre_sigmoid_h, h_mean, h_sample] def gibbs_hvh(self, h0_sample): pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) return [pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample] def gibbs_hvh_mf(self, h0_sample): pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h0_sample) pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) return [pre_softmax_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample] def gibbs_vhv(self, v0_sample, D): pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D) pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample, D) return [pre_sigmoid_h1, h1_mean, h1_sample, pre_softmax_v1, v1_mean, v1_sample] def gibbs_vhv_mf(self, v0_sample, D): pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample, D) pre_softmax_v1, v1_mean, v1_sample = self.sample_v_given_h_mf(h1_sample, D) return [pre_sigmoid_h1, h1_mean, h1_sample, 
pre_softmax_v1, v1_mean, v1_sample]
def add_watch(self, w, name): self.watches.append(w) self.watches_label.append(name)
def clean_watches(self): self.watches = [] self.watches_label = []
def get_cost_updates(self, train_params): from theano.ifelse import ifelse l_rate = T.cast(train_params["learning_rate"], dtype=theano.config.floatX) weight_decay = T.cast(train_params["weight_decay"], dtype=theano.config.floatX) momentum = T.cast(train_params["momentum"], dtype=theano.config.floatX) init_momentum = T.cast(train_params["init_momentum"], dtype=theano.config.floatX) moment_start = train_params["moment_start"] batch_size = T.cast(train_params["batch_size"], dtype=theano.config.floatX) cd_steps = train_params["cd_steps"] persistent = train_params["persistent"] persistent_on = train_params["persistent_on"] sparse_damping = T.cast(train_params["sparse_damping"], dtype=theano.config.floatX) sparse_cost = T.cast(train_params["sparse_cost"], dtype=theano.config.floatX) sparse_target = T.cast(train_params["sparse_target"], dtype=theano.config.floatX)
# compute positive phase
pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input) self.add_watch(self.input, "vis_s") self.add_watch(ph_mean, "hid_m")
# a Python `if` on a symbolic comparison is resolved at graph-build time, so
# the run-time check for an empty persistent chain has to go through ifelse
if persistent_on: chain_start = ifelse(T.eq(T.sum(persistent), 0), ph_sample, persistent) else: chain_start = ph_sample if train_params["mean_field"]: gibbs_hvh_fun = self.gibbs_hvh_mf else: gibbs_hvh_fun = self.gibbs_hvh [pre_softmax_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means, nh_samples], updates = theano.scan( gibbs_hvh_fun, outputs_info=[None, None, None, None, None, chain_start], n_steps=cd_steps ) vis_samp_fant = nv_samples[-1] hid_probs_fant = nh_means[-1] self.add_watch(vis_samp_fant, "neg_vis_s") self.add_watch(hid_probs_fant, "neg_hid_m") cur_momentum = T.switch(T.lt(self.epoch_ratio[0], moment_start), init_momentum, momentum)
# sparsity stuff
hid_means = sparse_damping * self.hid_means + (1 - sparse_damping) * T.sum(ph_mean, axis=0) / batch_size sparse_grads = sparse_cost * ( T.tile(hid_means.dimshuffle("x", 0), (train_params["batch_size"], 1)) - sparse_target ) self.add_watch(hid_means, "hid_means") self.add_watch(sparse_grads, "sparse_grads")
# updates
W_inc = ( T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant) - T.dot(self.input.T, sparse_grads) ) / batch_size - self.W * weight_decay hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant, axis=0) - T.sum(sparse_grads, axis=0)) / batch_size
# W_inc = ( T.dot(self.input.T, ph_mean) - T.dot(vis_samp_fant.T, hid_probs_fant) )/batch_size - self.W * weight_decay
# hbias_inc = (T.sum(ph_mean, axis=0) - T.sum(hid_probs_fant,axis=0) )/batch_size
vbias_inc = (T.sum(self.input, axis=0) - T.sum(vis_samp_fant, axis=0)) / batch_size W_inc_rate = (self.W_inc * cur_momentum + W_inc) * l_rate hbias_inc_rate = (self.hbias_inc * cur_momentum + hbias_inc) * l_rate vbias_inc_rate = (self.vbias_inc * cur_momentum + vbias_inc) * l_rate updates[self.W] = self.W + W_inc_rate updates[self.hbias] = self.hbias + hbias_inc_rate updates[self.vbias] = self.vbias + vbias_inc_rate updates[self.W_inc] = W_inc updates[self.hbias_inc] = hbias_inc updates[self.vbias_inc] = vbias_inc updates[self.hid_means] = hid_means self.add_watch(T.as_tensor_variable(self.W), "W")
# self.add_watch(T.as_tensor_variable(self.hbias), "hbias")
# self.add_watch(T.as_tensor_variable(self.vbias), "vbias")
self.add_watch(W_inc_rate, "W_inc")
# self.add_watch(hbias_inc_rate, "hbias_inc")
# self.add_watch(vbias_inc_rate, "vbias_inc")
current_free_energy = T.mean(self.free_energy(self.input)) self.add_watch(current_free_energy, "free_en") if persistent_on: updates[persistent] = nh_samples[-1] monitoring_cost = self.get_reconstruction_cost(vis_samp_fant) self.add_watch(monitoring_cost, "cost") return monitoring_cost, current_free_energy, T.mean(W_inc_rate), updates
def get_pseudo_likelihood_cost(self, updates): """Stochastic approximation to the pseudo-likelihood""" bit_i_idx = theano.shared(value=0, name="bit_i_idx") xi = T.round(self.input) fe_xi = self.free_energy(xi) xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) fe_xi_flip = self.free_energy(xi_flip) cost = T.mean(self.num_vis * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))) updates[bit_i_idx] = (bit_i_idx + 1) % self.num_vis return cost
def get_reconstruction_cost(self, vis_sample, vis_source=None, D=None): if vis_source is None: return T.sum((T.sum(T.sqr(self.input - vis_sample), axis=1)) / self.D) return T.sum((T.sum(T.sqr(vis_source - vis_sample), axis=1)) / D)
class SRNN(Model): def __init__(self, name, numvis, numhid, numlayers, numframes, output_type='real', dropout=0.0, numpy_rng=None, theano_rng=None): super(SRNN, self).__init__(name=name) self.numvis = numvis # frame length * alphabet size (1 * 27) self.numhid = numhid # 512 self.numlayers = numlayers # 3 self.numframes = numframes # maxnumframes (100) self.output_type = output_type # softmax self.dropout = dropout # 0.5 if not numpy_rng: self.numpy_rng = np.random.RandomState(1) else: self.numpy_rng = numpy_rng if not theano_rng: self.theano_rng = RandomStreams(1) else: self.theano_rng = theano_rng self.inputs = T.matrix(name='inputs') self.whh = [theano.shared(value=np.eye(self.numhid).astype(theano.config.floatX), name='whh'+str(k)) for k in range(self.numlayers)] self.whx = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numvis)).astype(theano.config.floatX), name='whx'+str(k)) for k in range(self.numlayers)] self.wxh = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numhid)).astype(theano.config.floatX), name='wxh'+str(0))] self.wxh = self.wxh + [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numhid)).astype(theano.config.floatX), name='wxh'+str(k)) for k in range(self.numlayers-1)] self.bx = theano.shared(value=0.0 * np.ones( self.numvis, dtype=theano.config.floatX), name='bx') self.bhid = [theano.shared(value=0.0 * np.ones( self.numhid, dtype=theano.config.floatX), name='bhid'+str(k)) for k in range(self.numlayers)] self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx] self._batchsize = self.inputs.shape[0] self._input_frames = self.inputs.reshape(( self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis)).transpose(1, 0, 2) #1-step prediction --- self.hids_0 = T.zeros((self._batchsize, self.numhid*self.numlayers)) self.hids_1 = [T.dot(self.hids_0[:,:self.numhid], self.whh[0]) + self.bhid[0] + T.dot(self._input_frames[0], self.wxh[0])] self.hids_1[0] *= (self.hids_1[0] > 0) for k in range(1, self.numlayers): self.hids_1.append(T.dot(self.hids_0[:,k*self.numhid:(k+1)*self.numhid], self.whh[k]) + self.bhid[k] + T.dot(self.hids_1[k-1], self.wxh[k])) self.hids_1[-1] *= (self.hids_1[-1] > 0) self.x_pred_1 = self.bx for k in range(self.numlayers): self.x_pred_1 += T.dot(self.hids_1[k], self.whx[k]) self.hids_1 = T.concatenate(self.hids_1, 1) #--- 1-step prediction def step_dropout(x_gt_t, dropoutmask, x_tm1, hids_tm1): hids_tm1 = [hids_tm1[:,k*self.numhid:(k+1)*self.numhid] for k in range(self.numlayers)] pre_hids_t = [T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0])] hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)] for k in range(1, self.numlayers): pre_hids_t.append(T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(dropoutmask*hids_t[k-1], (1.0/self.dropout)*self.wxh[k])) hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0)) x_pred_t = self.bx for k in range(self.numlayers): x_pred_t += T.dot(hids_t[k], self.whx[k]) return x_pred_t, T.concatenate(hids_t, 1) def step_nodropout(x_gt_t, x_tm1, hids_tm1): hids_tm1 = [hids_tm1[:,k*self.numhid:(k+1)*self.numhid] for k in range(self.numlayers)] pre_hids_t = [T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0])] hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)] for k in range(1, self.numlayers): pre_hids_t.append(T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(hids_t[k-1], self.wxh[k])) hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0)) 
x_pred_t = self.bx for k in range(self.numlayers): x_pred_t += T.dot(hids_t[k], self.whx[k]) return x_pred_t, T.concatenate(hids_t, 1) if self.dropout == 0.0: (self._predictions, self.hids), self.updates = theano.scan( fn=step_nodropout, sequences=self._input_frames, outputs_info=[self._input_frames[0], self.hids_0]) else: self._dropoutmask = self.theano_rng.binomial( size=(self.inputs.shape[1] // self.numvis, self._batchsize, self.numhid), n=1, p=self.dropout, dtype=theano.config.floatX ) (self._predictions, self.hids), self.updates = theano.scan( fn=step_dropout, sequences=[self._input_frames, self._dropoutmask], outputs_info=[self._input_frames[0], self.hids_0]) if self.output_type == 'real': self._prediction = self._predictions[:, :, :self.numvis]  # dims: [time step, batch idx, numvis]
elif self.output_type == 'binary': self._prediction = sigmoid(self._predictions[:, :, :self.numvis]) elif self.output_type == 'softmax':
# softmax doesn't support 3d tensors, reshape batch and time axis together, apply softmax and reshape back to 3d tensor
self._prediction = T.nnet.softmax( self._predictions[:, :, :self.numvis].reshape(( self._predictions.shape[0] * self._predictions.shape[1], self.numvis )) ).reshape(( self._predictions.shape[0], self._predictions.shape[1], self.numvis )) else: raise ValueError('unsupported output_type') self._prediction_for_training = self._prediction[:self.numframes-1] if self.output_type == 'real': self._cost = T.mean(( self._prediction_for_training - self._input_frames[1:self.numframes])**2) self._cost_varlen = T.mean(( self._prediction - self._input_frames[1:])**2)  # for various lengths
elif self.output_type == 'binary': self._cost = -T.mean( self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log( 1.0 - self._prediction_for_training)) self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log( 1.0 - self._prediction)) elif self.output_type == 'softmax': self._cost = -T.mean(T.log( self._prediction_for_training) * self._input_frames[1:self.numframes]) self._cost_varlen = -T.mean(T.log( self._prediction) * self._input_frames[1:]) self._grads = T.grad(self._cost, self.params) self.inputs_var = T.fmatrix('inputs_var') self.nsteps = T.lscalar('nsteps') givens = {} givens[self.inputs] = T.concatenate( ( self.inputs_var[:, :self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis)) ), axis=1)
# predict given the first letters.
self.predict = theano.function( [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)], self._prediction.transpose(1, 0, 2).reshape((self.inputs_var.shape[0], self.nsteps*self.numvis)), updates=self.updates, givens=givens) self.cost = theano.function( [self.inputs], self._cost, updates=self.updates) self.grads = theano.function( [self.inputs], self._grads, updates=self.updates)
def grad(self, x): def get_cudandarray_value(x): if type(x) == theano.sandbox.cuda.CudaNdarray: return np.array(x.__array__()).flatten() else: return x.flatten() return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])
def sample(self, numcases=1, numframes=10, temperature=1.0): assert self.output_type == 'softmax' next_prediction_and_state = theano.function([self._input_frames, self.hids_0], [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)), self.hids_1]) preds = np.zeros((numcases, numframes, self.numvis), dtype="float32") preds[:,0,:] = self.numpy_rng.multinomial(1, pvals=np.ones(self.numvis)/np.float(self.numvis), size=numcases) hids = np.zeros((numcases, self.numhid*self.numlayers), dtype="float32") for t in range(1, numframes): nextpredandstate = next_prediction_and_state(preds[:,[t-1],:].transpose(1, 0, 2), hids) hids = nextpredandstate[1] preds[:,t,:] = nextpredandstate[0] return preds
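# sample() above divides the pre-softmax activations by a temperature.
# A scalar sketch of what that knob does to the sampling distribution:
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
for temperature in (0.5, 1.0, 2.0):
    # temperature < 1 sharpens the distribution, > 1 flattens it
    print temperature, softmax(logits / temperature)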
class SRNN(Model): def __init__(self, name, # a string for identifying model. numvis, numsz, numrz, numsl, numrl, numframes, output_type='real', cheating_level=.0, # cheating by lookig at x_t (instead of x_tm1) numpy_rng=None, theano_rng=None): super(SRNN, self).__init__(name=name) # store arguments self.numvis = numvis self.numsz = numsz # stacked zae layer self.numrz = numrz # recurrent zae layer self.numsl = numsl # stacked linear layer self.numrl = numrl # recurrent linear layer self.numlayers = 3 # number of total stacked layers self.numrecur = 2 # number of recurrent connections self.numframes = numframes self.output_type = output_type self.selectionthreshold = 0.0 self.cheating_level = theano.shared(numpy.float32(cheating_level)) if not numpy_rng: self.numpy_rng = numpy.random.RandomState(1) else: self.numpy_rng = numpy_rng if not theano_rng: self.theano_rng = RandomStreams(1) else: self.theano_rng = theano_rng # create input var self.inputs = T.matrix(name='inputs') # self.inputs.tag.test_value = numpy.random.rand(20, 27*50).astype(theano.config.floatX) # set up params # recurrent connections: self.whh = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numsz) ).astype(theano.config.floatX), name='whl0'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numsz) ).astype(theano.config.floatX), name='whl1')] # vertical connections: self.whx = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numvis) ).astype(theano.config.floatX), name='whx0'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numvis) ).astype(theano.config.floatX), name='whx1')] self.wxh = [theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numsz) ).astype(theano.config.floatX), name='wxh0'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numsl) ).astype(theano.config.floatX), name='wxh1'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsl, self.numsz) ).astype(theano.config.floatX), name='wxh2')] self.bx = theano.shared( value=0.0 * numpy.ones(self.numvis, dtype=theano.config.floatX), name='bx') self.params = self.whh + self.whx + self.wxh + [self.bx] self._batchsize = self.inputs.shape[0] # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis) self._input_frames = self.inputs.reshape(( self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis )).transpose(1, 0, 2) # one-step prediction, used by sampling function self.hids_t0 = [T.zeros((self._batchsize, self.numsz)), T.zeros((self._batchsize, self.numsl)), T.zeros((self._batchsize, self.numsz))] self.hids_t1 = [ReLU( T.dot(self.hids_t0[0], self.whh[0] ) + T.dot(self._input_frames[0], self.wxh[0]) )] self.hids_t1.append(T.dot(self.hids_t1[-1], self.wxh[1])) self.hids_t1.append(ReLU( T.dot(self.hids_t0[2], self.whh[1] ) + T.dot(self.hids_t1[-1], self.wxh[2]) )) self.x_pred_1 = self.bx + T.dot(self.hids_t1[0], self.whx[0]) + T.dot(self.hids_t1[2], self.whx[1]) # end of one-step prediction # pdb.set_trace() def step(x_tm1, hids_tm1): hids_tm1 = [hids_tm1[:, : self.numsz ], hids_tm1[:, self.numsz :(self.numsz +self.numsl)], hids_tm1[:, (self.numsz+self.numsl):(self.numsz*2+self.numsl)]] hids_t = [ReLU( T.dot(hids_tm1[0], self.whh[0] ) + T.dot(x_tm1, self.wxh[0]) )] hids_t.append(T.dot(hids_t[-1], self.wxh[1])) hids_t.append(ReLU( T.dot(hids_tm1[2], self.whh[1] ) + 
T.dot(hids_t[-1], self.wxh[2]) )) x_pred_t = self.bx + T.dot(hids_t[0], self.whx[0]) + T.dot(hids_t[2], self.whx[1]) return x_pred_t, T.concatenate(hids_t, 1) (self._predictions, self.hids), self.updates = theano.scan( fn=step, sequences=self._input_frames[:-1], outputs_info=[None, T.concatenate(self.hids_t0, 1)])
# set up output prediction
if self.output_type == 'real': self._prediction = self._predictions[:, :, :self.numvis] elif self.output_type == 'binary': self._prediction = sigmoid(self._predictions[:, :, :self.numvis]) elif self.output_type == 'softmax':
# softmax doesn't support 3d tensors, reshape batch and time axis together, apply softmax and reshape back to 3d tensor
self._prediction = T.nnet.softmax( self._predictions[:, :, :self.numvis].reshape(( self._predictions.shape[0] * self._predictions.shape[1], self.numvis )) ).reshape(( self._predictions.shape[0], self._predictions.shape[1], self.numvis )) else: raise ValueError('unsupported output_type')
# set cost
self._prediction_for_training = self._prediction[:self.numframes-1] if self.output_type == 'real': self._cost = T.mean(( self._prediction_for_training - self._input_frames[1:self.numframes] )**2) self._cost_varlen = T.mean(( self._prediction - self._input_frames[1:] )**2) elif self.output_type == 'binary': self._cost = -T.mean( self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log( 1.0 - self._prediction_for_training)) self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log( 1.0 - self._prediction)) elif self.output_type == 'softmax': self._cost = -T.mean(T.log( self._prediction_for_training) * self._input_frames[1:self.numframes]) self._cost_varlen = -T.mean(T.log( self._prediction) * self._input_frames[1:])
# set gradients
self._grads = T.grad(self._cost, self.params)
# theano function for computing cost and grad
self.cost = theano.function([self.inputs], self._cost, updates=self.updates) self.grads = theano.function([self.inputs], self._grads, updates=self.updates)
# another set of variables
# give some time steps of characters and free the model to predict for all the rest.
self.inputs_var = T.fmatrix('inputs_var') self.nsteps = T.lscalar('nsteps') givens = {} givens[self.inputs] = T.concatenate( (self.inputs_var[:, :self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis)) ), axis=1) self.predict = theano.function( [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)], self._prediction.transpose(1, 0, 2).reshape(( self.inputs_var.shape[0], self.nsteps*self.numvis)), updates=self.updates, givens=givens)
def grad(self, x): def get_cudandarray_value(x): if type(x) == theano.sandbox.cuda.CudaNdarray: return numpy.array(x.__array__()).flatten() else: return x.flatten() return numpy.concatenate([get_cudandarray_value(g) for g in self.grads(x)])
def sample(self, numcases=1, numframes=10, temperature=1.0): assert self.output_type == 'softmax' next_prediction_and_state = theano.function( [self._input_frames, T.concatenate(self.hids_t0, 1)], [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)), T.concatenate(self.hids_t1, 1)] ) preds = numpy.zeros((numcases, numframes, self.numvis), dtype="float32") preds[:, 0, :] = self.numpy_rng.multinomial( 1, pvals=numpy.ones(self.numvis)/numpy.float(self.numvis), size=numcases) hids = numpy.zeros((numcases, self.numsz*2 + self.numsl), dtype="float32") for t in range(1, numframes): nextpredandstate = next_prediction_and_state(preds[:,[t-1],:].transpose(1, 0, 2), hids) hids = nextpredandstate[1] preds[:,t,:] = nextpredandstate[0] return preds
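# Note on seeding the sampling loops above: np.random.multinomial(n, pvals)
# returns a single count vector summing to n, so producing one one-hot row
# per case takes n=1 with size=numcases:
import numpy as np

rng = np.random.RandomState(0)
numcases, numvis = 4, 6
one_hot = rng.multinomial(1, np.ones(numvis) / float(numvis), size=numcases)
print one_hot.shape, one_hot.sum(axis=1)  # (4, 6) [1 1 1 1]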
class MDN(object): """Mixture Density Network """
def __init__(self, input, rng, n_in, n_hiddens, hid_activations, n_out, out_activation, n_components): """Initialize the parameters for the multilayer perceptron :type rng: np.random.RandomState :param rng: a random number generator used to initialize weights :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hiddens: list of int :param n_hiddens: a list of the number of units in each hidden layer :type hid_activations: list of lambdas :param hid_activations: a list of the activations used in each hidden layer :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ from theano.tensor.shared_randomstreams import RandomStreams self.srng = RandomStreams(seed=1234) self.input = input
# an MLP with multiple hidden layers
layer0 = NetworkLayer(rng=rng, input=input, n_in=n_in, n_out=n_hiddens[0], activation=hid_activations[0]) h_layers = [('hiddenLayer0', layer0)] for i in range(1, len(n_hiddens)): h_layers.append(('hiddenLayer%d' % i, NetworkLayer(rng=rng, input=h_layers[i - 1][1].output, n_in=n_hiddens[i - 1], n_out=n_hiddens[i], activation=hid_activations[i]))) self.__dict__.update(dict(h_layers))
# the output layer gets as input the hidden units of the hidden layer
self.outputLayer = MDNoutputLayer(rng=rng, input=h_layers[-1][1].output, n_in=n_hiddens[-1], n_out=n_out, mu_activation=out_activation, n_components=n_components)
# square of L2 norm ; one regularization option is to enforce square of L2 norm to be small
self.L2_sqr = (self.outputLayer.W_mu ** 2).sum() + (self.outputLayer.W_sigma ** 2).sum() + (self.outputLayer.W_mixing ** 2).sum() for i in range(len(n_hiddens)): self.L2_sqr += (self.__dict__['hiddenLayer%d' % i].W**2).sum()
# the parameters of the model are the parameters of the all layers it is made out of
params = self.outputLayer.params for layer in h_layers: params.extend(layer[1].params) self.params = params
def set_symbolic_input(self, input): """We use this function to bind a symbolic variable with the input of the network layer. 
Added to specify that in training time.""" self.input = input # def train(self, x, y, training_loss, learning_rate, def train(self, y, training_loss, learning_rate, n_epochs, train_x, train_y, valid_x, valid_y, batch_size): """Train the MLP using SGD""" index = T.iscalar() # index to a [mini]batch lr = T.scalar() # learning rate symbolic #index.tag.test_value = 1 gparams = [] for param in self.params: gparam = T.grad(training_loss, param) gparams.append(gparam) updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * \ T.cast(lr,dtype=theano.config.floatX))) train_model = theano.function( inputs=[index, lr], outputs=[training_loss], updates=updates, givens={ self.input: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=NLL(mu=self.outputLayer.mu, sigma=self.outputLayer.sigma, mixing=self.outputLayer.mixing, y=y), givens={ self.input: valid_x[index * batch_size:(index + 1) * batch_size], y: valid_y[index * batch_size:(index + 1) * batch_size] }) # compute number of minibatches for training and validation n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size validate_MSE = theano.function( inputs=[index], outputs=MSE(self.samples(), y=y), givens={ self.input: valid_x[index * batch_size:(index + 1) * batch_size], y: valid_y[index * batch_size:(index + 1) * batch_size] }) print 'training...' start_time = time.clock() epoch = 0 total_training_costs = [] total_validation_costs = [] total_validation_MSE = [] lr_time = 0 lr_step = learning_rate / ( (train_x.get_value().shape[0] * 1.0 / batch_size) * (n_epochs - 30)) lr_val = learning_rate while (epoch < n_epochs): epoch = epoch + 1 epoch_training_costs = [] #import pdb; pdb.set_trace() for minibatch_index in xrange(n_train_batches): # linear annealing after 40 epochs... if epoch > 40: # lr_val = learning_rate / (1.0+lr_time) # lr_time = lr_time + 1 lr_val = lr_val - lr_step else: lr_val = learning_rate loss_value = \ train_model(minibatch_index, lr_val) epoch_training_costs.append(loss_value) if np.isnan(loss_value): print 'got NaN in NLL' sys.exit(1) this_training_cost = np.mean(epoch_training_costs) this_validation_cost = np.mean( [validate_model(i) for i in xrange(n_valid_batches)]) this_validation_MSE = np.mean( [validate_MSE(i) for i in xrange(n_valid_batches)]) total_training_costs.append(this_training_cost) total_validation_costs.append(this_validation_cost) total_validation_MSE.append(this_validation_MSE) print 'epoch %i, training NLL %f, validation NLL %f, MSE %f' %\ (epoch, this_training_cost,this_validation_cost, this_validation_MSE) end_time = time.clock() print "Training took %.2f minutes..." % ((end_time - start_time) / 60.) #return losses and parameters.. 
return total_training_costs, total_validation_costs, total_validation_MSE
def samples(self): component = self.srng.multinomial(pvals=self.outputLayer.mixing) component_mean = T.sum(self.outputLayer.mu * component.dimshuffle(0,'x',1), axis=2) component_std = T.sum(self.outputLayer.sigma * component, axis=1, keepdims=True) samples = self.srng.normal(avg=component_mean, std=component_std) return samples
def save_model(self, filename='MLP.save', output_folder='output_folder'): """ This function pickles the parameters in a file for later usage """ storage_file = open(os.path.join(output_folder, filename), 'wb') cPickle.dump(self, storage_file, protocol=cPickle.HIGHEST_PROTOCOL) storage_file.close()
@staticmethod def load_model(filename='MLP.save', output_folder='output_folder'): """ This function loads pickled parameters from a file """ storage_file = open(os.path.join(output_folder, filename), 'rb') model = cPickle.load(storage_file) storage_file.close() return model
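# Hedged numpy mirror of MDN.samples() above: the multinomial picks one
# mixture component per row, then a Gaussian is drawn with that component's
# mean and std. Toy shapes and values only.
import numpy as np

rng = np.random.RandomState(0)
mixing = np.array([[0.2, 0.8], [0.9, 0.1]])    # (batch, n_components)
mu = np.array([[[0.0, 5.0]], [[1.0, -3.0]]])   # (batch, n_out=1, n_components)
sigma = np.array([[0.1, 0.5], [0.2, 0.3]])     # (batch, n_components)
component = np.array([rng.multinomial(1, p) for p in mixing])
mean = (mu * component[:, None, :]).sum(axis=2)   # mu of the chosen component
std = (sigma * component).sum(axis=1, keepdims=True)
print mean + std * rng.randn(*mean.shape)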
class SRNN(Model):
# name: a string for identifying the model
# cheating_level: cheating by looking at x_t (instead of x_tm1)
def __init__(self, name, numvis, numhid, numframes, output_type='real', cheating_level=.0, numpy_rng=None, theano_rng=None): super(SRNN, self).__init__(name=name)
# store arguments
self.numvis = numvis self.numhid = numhid self.numframes = numframes self.output_type = output_type self.selectionthreshold = 0.0 self.cheating_level = theano.shared(np.float32(cheating_level)) if not numpy_rng: self.numpy_rng = np.random.RandomState(1) else: self.numpy_rng = numpy_rng if not theano_rng: self.theano_rng = RandomStreams(1) else: self.theano_rng = theano_rng
# create input var
self.inputs = T.matrix(name='inputs')
# set up params
self.whh = theano.shared( value=np.eye(self.numhid).astype(theano.config.floatX), name='whh') self.whx = theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numvis) ).astype(theano.config.floatX), name='whx') self.wxh = theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numhid) ).astype(theano.config.floatX), name='wxh') self.bx = theano.shared( value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX), name='bx') self.params = [self.whh, self.whx, self.wxh, self.bx] self._batchsize = self.inputs.shape[0]
# reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis)
self._input_frames = self.inputs.reshape(( self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis )).transpose(1, 0, 2)
# one-step prediction, used by sampling function
self.hids_0 = T.zeros((self._batchsize, self.numhid)) self.hids_1 = T.dot(self.hids_0, self.whh) + T.dot(self._input_frames[0], self.wxh) self.hids_1 = self.hids_1 * (self.hids_1 > self.selectionthreshold) self.x_pred_1 = T.dot(self.hids_1, self.whx) + self.bx
# step cheats by mixing in the current time step input x_gt_t (see the sketch after this class)
def step(x_gt_t, x_tm1, hids_tm1): pre_hids_t = T.dot(hids_tm1, self.whh) + T.dot( self.cheating_level * x_gt_t + (1.-self.cheating_level) * x_tm1, self.wxh) hids_t = pre_hids_t * (pre_hids_t > self.selectionthreshold) x_pred_t = T.dot(hids_t, self.whx) + self.bx return x_pred_t, hids_t (self._predictions, self.hids), self.updates = theano.scan( fn=step, sequences=self._input_frames, outputs_info=[self._input_frames[0], self.hids_0])
# set up output prediction
if self.output_type == 'real': self._prediction = self._predictions[:, :, :self.numvis] elif self.output_type == 'binary': self._prediction = sigmoid(self._predictions[:, :, :self.numvis]) elif self.output_type == 'softmax':
# softmax doesn't support 3d tensors, reshape batch and time axis together, apply softmax and reshape back to 3d tensor
self._prediction = T.nnet.softmax( self._predictions[:, :, :self.numvis].reshape(( self._predictions.shape[0] * self._predictions.shape[1], self.numvis )) ).reshape(( self._predictions.shape[0], self._predictions.shape[1], self.numvis )) else: raise ValueError('unsupported output_type')
# set cost
self._prediction_for_training = self._prediction[:self.numframes-1] if self.output_type == 'real': self._cost = T.mean(( self._prediction_for_training - self._input_frames[1:self.numframes] )**2) self._cost_varlen = T.mean(( self._prediction - self._input_frames[1:] )**2) elif self.output_type == 'binary': self._cost = -T.mean( self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log( 1.0 - self._prediction_for_training)) self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log( 1.0 - self._prediction)) elif self.output_type == 'softmax': self._cost = -T.mean(T.log( self._prediction_for_training) * self._input_frames[1:self.numframes]) self._cost_varlen = -T.mean(T.log( self._prediction) * self._input_frames[1:])
# set gradients
self._grads = T.grad(self._cost, self.params)
# theano function for computing cost and grad
self.cost = theano.function([self.inputs], self._cost, updates=self.updates) self.grads = theano.function([self.inputs], self._grads, updates=self.updates)
# another set of variables
# give some time steps of characters and free the model to predict for all the rest.
self.inputs_var = T.fmatrix('inputs_var') self.nsteps = T.lscalar('nsteps') givens = {} givens[self.inputs] = T.concatenate( (self.inputs_var[:, :self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps*self.numvis)) ), axis=1) self.predict = theano.function( [self.inputs_var, theano.Param(self.nsteps, default=self.numframes-4)], self._prediction.transpose(1, 0, 2).reshape(( self.inputs_var.shape[0], self.nsteps*self.numvis)), updates=self.updates, givens=givens)
def grad(self, x): def get_cudandarray_value(x): if type(x) == theano.sandbox.cuda.CudaNdarray: return np.array(x.__array__()).flatten() else: return x.flatten() return np.concatenate([get_cudandarray_value(g) for g in self.grads(x)])
def sample(self, numcases=1, numframes=10, temperature=1.0): assert self.output_type == 'softmax' next_prediction_and_state = theano.function( [self._input_frames, self.hids_0], [self.theano_rng.multinomial(pvals=T.nnet.softmax(self.x_pred_1/temperature)), self.hids_1] ) preds = np.zeros((numcases, numframes, self.numvis), dtype="float32") preds[:, 0, :] = self.numpy_rng.multinomial(1, pvals=np.ones(self.numvis)/np.float(self.numvis), size=numcases) hids = np.zeros((numcases, self.numhid), dtype="float32") for t in range(1, numframes): nextpredandstate = next_prediction_and_state(preds[:,[t-1],:].transpose(1, 0, 2), hids) hids = nextpredandstate[1] preds[:,t,:] = nextpredandstate[0] return preds
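# The cheating_level blend in step() above interpolates between feeding the
# model its own previous prediction (x_tm1) and peeking at the frame it is
# supposed to predict (x_gt_t). A scalar sketch of the mix:
x_gt_t, x_tm1 = 1.0, 0.4
for cheating_level in (0.0, 0.5, 1.0):
    x_in = cheating_level * x_gt_t + (1.0 - cheating_level) * x_tm1
    print cheating_level, x_in  # 0.0 -> no cheating, 1.0 -> full cheating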
class SRNN(Model):
# name: a string for identifying the model
# cheating_level: cheating by looking at x_t (instead of x_tm1)
def __init__( self, name, numvis, numsz, numrz, numsl, numrl, numframes, output_type='real', cheating_level=.0, numpy_rng=None, theano_rng=None): super(SRNN, self).__init__(name=name)
# store arguments
self.numvis = numvis self.numsz = numsz  # stacked zae layer
self.numrz = numrz  # recurrent zae layer
self.numsl = numsl  # stacked linear layer
self.numrl = numrl  # recurrent linear layer
self.numlayers = 3  # number of total stacked layers
self.numrecur = 2  # number of recurrent connections
self.numframes = numframes self.output_type = output_type self.selectionthreshold = 0.0 self.cheating_level = theano.shared(numpy.float32(cheating_level)) if not numpy_rng: self.numpy_rng = numpy.random.RandomState(1) else: self.numpy_rng = numpy_rng if not theano_rng: self.theano_rng = RandomStreams(1) else: self.theano_rng = theano_rng
# create input var
self.inputs = T.matrix(name='inputs')
# set up params
# recurrent connections: train a pair of orthogonal matrices for each h->l->h connection.
print "... generating random orthogonal matrices"
# def rand_ortho_np(shape, irange):
#     A = - irange + 2 * irange * np.random.rand(*shape)
#     U, s, V = np.linalg.svd(A, full_matrices=True)
#     return np.dot(U, np.dot( np.eye(U.shape[1], V.shape[0]), V ))  # np.dot(aaa.T, aaa) = I
assert self.numsz <= self.numrl eye = T.eye(self.numsz) var = theano.shared( self.numpy_rng.uniform(low=-numpy.sqrt(3. / self.numrl), high=numpy.sqrt(3. / self.numrl), size=(self.numsz, self.numrl)).astype( theano.config.floatX)) c = T.sum((T.dot(var, var.T) - eye)**2) grad = T.grad(c, wrt=var) train = theano.function([], c, updates=[(var, var - 0.1 * grad)]) i = numpy.inf while i > 1e-10: i = train() var0 = var.get_value() var.set_value( self.numpy_rng.uniform(low=-numpy.sqrt(3. / self.numrl), high=numpy.sqrt(3. / self.numrl), size=(self.numsz, self.numrl)).astype( theano.config.floatX)) i = numpy.inf while i > 1e-10: i = train() var1 = var.get_value() self.whl = [ theano.shared(value=var0, name='whl0'), theano.shared(value=var1, name='whl1') ] self.wlh = [ theano.shared(value=var0.T, name='wlh0'), theano.shared(value=var1.T, name='wlh1') ] del var print "Done." 
# vertical connections: self.whx = [ theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numvis)).astype(theano.config.floatX), name='whx0'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numvis)).astype(theano.config.floatX), name='whx1') ] self.wxh = [ theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numsz)).astype(theano.config.floatX), name='wxh0'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsz, self.numsl)).astype(theano.config.floatX), name='wxh1'), theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numsl, self.numsz)).astype(theano.config.floatX), name='wxh2') ] self.bx = theano.shared( value=0.0 * numpy.ones(self.numvis, dtype=theano.config.floatX), name='bx') self.params = self.whl + self.wlh + self.whx + self.wxh + [self.bx] self._batchsize = self.inputs.shape[0] # reshape input var from 2D [ Bx(NxT) ] to 3D [ TxBxN ] (time, batch, numvis) self._input_frames = self.inputs.reshape( (self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis)).transpose(1, 0, 2) # one-step prediction, used by sampling function self.hids_t0 = [ T.zeros((self._batchsize, self.numsz)), T.zeros((self._batchsize, self.numsl)), T.zeros((self._batchsize, self.numsz)) ] self.hids_t1 = [ ReLU( T.dot(T.dot(self.hids_t0[0], self.whl[0]), self.wlh[0]) + T.dot(self._input_frames[0], self.wxh[0])) ] self.hids_t1.append(T.dot(self.hids_t1[-1], self.wxh[1])) self.hids_t1.append( ReLU( T.dot(T.dot(self.hids_t0[2], self.whl[1]), self.wlh[1]) + T.dot(self.hids_t1[-1], self.wxh[2]))) self.x_pred_1 = self.bx + T.dot(self.hids_t1[0], self.whx[0]) + T.dot( self.hids_t1[2], self.whx[1]) # end of one-step prediction def step(x_tm1, hids_tm1): hids_tm1 = [ hids_tm1[:, :self.numsz], hids_tm1[:, self.numsz:(self.numsz + self.numsl)], hids_tm1[:, (self.numsz + self.numsl):(self.numsz * 2 + self.numsl)] ] hids_t = [ ReLU( T.dot(T.dot(hids_tm1[0], self.whl[0]), self.wlh[0]) + T.dot(x_tm1, self.wxh[0])) ] hids_t.append(T.dot(hids_t[-1], self.wxh[1])) hids_t.append( ReLU( T.dot(T.dot(hids_tm1[2], self.whl[1]), self.wlh[1]) + T.dot(hids_t[-1], self.wxh[2]))) x_pred_t = self.bx + T.dot(hids_t[0], self.whx[0]) + T.dot( hids_t[2], self.whx[1]) return x_pred_t, T.concatenate(hids_t, 1) (self._predictions, self.hids), self.updates = theano.scan( fn=step, sequences=self._input_frames[:49], outputs_info=[None, T.concatenate(self.hids_t0, 1)]) # set up output prediction if self.output_type == 'real': self._prediction = self._predictions[:, :, :self.numvis] elif self.output_type == 'binary': self._prediction = sigmoid(self._predictions[:, :, :self.numvis]) elif self.output_type == 'softmax': # softmax doesn't support 3d tensors, reshape batch and time axis # together, apply softmax and reshape back to 3d tensor self._prediction = T.nnet.softmax( self._predictions[:, :, :self.numvis].reshape( (self._predictions.shape[0] * self._predictions.shape[1], self.numvis))).reshape( (self._predictions.shape[0], self._predictions.shape[1], self.numvis)) else: raise ValueError('unsupported output_type') # set cost self._prediction_for_training = self._prediction[:self.numframes - 1] if self.output_type == 'real': self._cost = T.mean((self._prediction_for_training - self._input_frames[1:self.numframes])**2) self._cost_varlen = T.mean( (self._prediction - self._input_frames[1:])**2) elif self.output_type == 'binary': self._cost = 
-T.mean(self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log(1.0 - self._prediction_for_training)) self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction)) elif self.output_type == 'softmax': self._cost = -T.mean( T.log(self._prediction_for_training) * self._input_frames[1:self.numframes]) self._cost_varlen = -T.mean( T.log(self._prediction) * self._input_frames[1:])
# set gradients
self._grads = T.grad(self._cost, self.params)
# theano function for computing cost and grad
self.cost = theano.function([self.inputs], self._cost, updates=self.updates) self.grads = theano.function([self.inputs], self._grads, updates=self.updates)
# another set of variables
# give some time steps of characters and free the model to predict for all the rest.
self.inputs_var = T.fmatrix('inputs_var') self.nsteps = T.lscalar('nsteps') givens = {} givens[self.inputs] = T.concatenate( (self.inputs_var[:, :self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))), axis=1) self.predict = theano.function( [ self.inputs_var, theano.Param(self.nsteps, default=self.numframes - 4) ], self._prediction.transpose(1, 0, 2).reshape( (self.inputs_var.shape[0], self.nsteps * self.numvis)), updates=self.updates, givens=givens)
def grad(self, x): def get_cudandarray_value(x): if type(x) == theano.sandbox.cuda.CudaNdarray: return numpy.array(x.__array__()).flatten() else: return x.flatten() return numpy.concatenate( [get_cudandarray_value(g) for g in self.grads(x)])
def sample(self, numcases=1, numframes=10, temperature=1.0): assert self.output_type == 'softmax' next_prediction_and_state = theano.function( [self._input_frames, T.concatenate(self.hids_t0, 1)], [ self.theano_rng.multinomial( pvals=T.nnet.softmax(self.x_pred_1 / temperature)), T.concatenate(self.hids_t1, 1) ]) preds = numpy.zeros((numcases, numframes, self.numvis), dtype="float32") preds[:, 0, :] = self.numpy_rng.multinomial( 1, pvals=numpy.ones(self.numvis) / numpy.float(self.numvis), size=numcases) hids = numpy.zeros((numcases, self.numsz * 2 + self.numsl), dtype="float32") for t in range(1, numframes): nextpredandstate = next_prediction_and_state( preds[:, [t - 1], :].transpose(1, 0, 2), hids) hids = nextpredandstate[1] preds[:, t, :] = nextpredandstate[0] return preds
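# The commented-out rand_ortho_np() in __init__ above hints at a closed-form
# alternative to the iterative orthogonalization loop: an SVD yields a
# matrix with orthonormal rows directly. A sketch under that assumption:
import numpy as np

def rand_ortho_np(shape, irange, rng=np.random):
    A = -irange + 2 * irange * rng.rand(*shape)
    U, s, V = np.linalg.svd(A, full_matrices=False)
    return np.dot(U, V)  # rows orthonormal when shape[0] <= shape[1]

W = rand_ortho_np((3, 5), 0.1)
print np.allclose(np.dot(W, W.T), np.eye(3))  # True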
class CharacterRNN(ParameterModel): def __init__(self, name, n_input, n_output, n_hidden=10, n_layers=2, seed=None): super(CharacterRNN, self).__init__(name) self.n_hidden = n_hidden self.n_layers = n_layers self.n_input = n_input self.n_output = n_output self.lstm = MultilayerLSTM('%s-charrnn' % name, self.n_input, n_hidden=self.n_hidden, n_layers=self.n_layers, ) self.rng = RandomStreams(seed) self.output = Softmax('%s-softmax' % name, n_hidden, self.n_output) def save_parameters(self, location): state = { 'n_hidden': self.n_hidden, 'n_layers': self.n_layers, 'lstm': self.lstm.state(), 'output': self.output.state() } with open(location, 'wb') as fp: pickle.dump(state, fp) def load_parameters(self, location): with open(location, 'rb') as fp: state = pickle.load(fp) self.n_hidden = state['n_hidden'] self.n_layers = state['n_layers'] self.lstm.load(state['lstm']) self.output.load(state['output']) @theanify(T.tensor3('X'), T.tensor3('state'), T.tensor3('y'), returns_updates=True) def cost(self, X, state, y): (_, state, ypred), updates = self.forward(X, state) S, N, V = y.shape y = y.reshape((S * N, V)) ypred = ypred.reshape((S * N, V)) return (T.nnet.categorical_crossentropy(ypred, y).mean(), state), updates def forward(self, X, state): S, N, D = X.shape H = self.lstm.n_hidden L = self.lstm.n_layers O = self.output.n_output def step(input, previous_hidden, previous_state, previous_output): lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state) final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0) return lstm_hidden, state, final_output hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) (encoder_output, encoder_state, softmax_output), updates = theano.scan(step, sequences=[X], outputs_info=[ hidden, state, T.alloc(np.asarray(0).astype(theano.config.floatX), N, O), ], n_steps=S) return (encoder_output, encoder_state, softmax_output), updates @theanify(T.vector('start_token'), T.iscalar('length'), T.scalar('temperature'), returns_updates=True) def generate(self, start_token, length, temperature): start_token = start_token[:, np.newaxis].T N = 1 H = self.lstm.n_hidden L = self.lstm.n_layers def step(input, previous_hidden, previous_state, temperature): lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state) final_output = self.output.forward(lstm_hidden[:, -1, :], temperature) sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, dtype=theano.config.floatX) return sample, lstm_hidden, state hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) (softmax_output, _, _), updates = theano.scan(step, outputs_info=[ start_token, hidden, state, ], non_sequences=[temperature], n_steps=length) return softmax_output[:, 0, :], updates @theanify(T.fvector('start_token'), T.fvector('concat'), T.iscalar('length'), T.fscalar('temperature'), returns_updates=True) def generate_with_concat(self, start_token, concat, length, temperature): start_token = start_token[:, np.newaxis].T concat = concat[:, np.newaxis].T N = 1 H = self.lstm.n_hidden L = self.lstm.n_layers def step(input, previous_hidden, previous_state, temperature, concat): lstm_hidden, state = self.lstm.forward(T.concatenate([input, concat], axis=1), previous_hidden, previous_state) final_output = self.output.forward(lstm_hidden[:, -1, :], temperature) sample = self.rng.multinomial(n=1, size=(1,), pvals=final_output, 
dtype=theano.config.floatX) return sample, lstm_hidden, state hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) (softmax_output, _, _), updates = theano.scan(step, outputs_info=[ start_token, hidden, state, ], non_sequences=[temperature, concat], n_steps=length) return softmax_output[:, 0, :], updates @theanify(T.tensor3('X'), returns_updates=True) def log_probability(self, X): S, N, D = X.shape H = self.lstm.n_hidden L = self.lstm.n_layers O = self.n_output def step(input, log_prob, previous_hidden, previous_state): lstm_hidden, state = self.lstm.forward(input, previous_hidden, previous_state) final_output = self.output.forward(lstm_hidden[:, -1, :], 1.0) return final_output, lstm_hidden, state hidden = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H), 1) start_log = T.alloc(np.array(0).astype(theano.config.floatX), N, O) state = T.unbroadcast(T.alloc(np.array(0).astype(theano.config.floatX), N, L, H)) (log_prob, _, _), updates = theano.scan(step, sequences=[X], outputs_info=[ start_log, hidden, state, ], n_steps=S) return log_prob, updates def get_parameters(self): return self.lstm.get_parameters() + self.output.get_parameters()
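# cost() above flattens the time and batch axes before the cross-entropy,
# since T.nnet.categorical_crossentropy expects one distribution per row of
# a matrix. A standalone check of that reshape:
import numpy as np
import theano
import theano.tensor as T

y = T.tensor3('y')
ypred = T.tensor3('ypred')
S, N, V = y.shape
cost = T.nnet.categorical_crossentropy(
    ypred.reshape((S * N, V)), y.reshape((S * N, V))).mean()
cost_fn = theano.function([ypred, y], cost)
p = np.ones((2, 3, 4), dtype=theano.config.floatX) / 4.0
t = np.zeros((2, 3, 4), dtype=theano.config.floatX)
t[:, :, 0] = 1.0
print cost_fn(p, t)  # == -log(0.25)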
class SRNN(Model): def __init__(self, name, numvis, numhid, numlayers, numframes, output_type='real', dropout=0.0, numpy_rng=None, theano_rng=None): super(SRNN, self).__init__(name=name) self.numvis = numvis # frame length * alphabet size (1 * 27) self.numhid = numhid # 512 self.numlayers = numlayers # 3 self.numframes = numframes # maxnumframes (100) self.output_type = output_type # softmax self.dropout = dropout # 0.5 if not numpy_rng: self.numpy_rng = np.random.RandomState(1) else: self.numpy_rng = numpy_rng if not theano_rng: self.theano_rng = RandomStreams(1) else: self.theano_rng = theano_rng self.inputs = T.matrix(name='inputs') self.whh = [ theano.shared(value=np.eye(self.numhid).astype( theano.config.floatX), name='whh' + str(k)) for k in range(self.numlayers) ] self.whx = [ theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numvis)).astype(theano.config.floatX), name='whx' + str(k)) for k in range(self.numlayers) ] self.wxh = [ theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numvis, self.numhid)).astype(theano.config.floatX), name='wxh' + str(0)) ] self.wxh = self.wxh + [ theano.shared(value=self.numpy_rng.uniform( low=-0.01, high=0.01, size=(self.numhid, self.numhid)).astype( theano.config.floatX), name='wxh' + str(k)) for k in range(self.numlayers - 1) ] self.bx = theano.shared( value=0.0 * np.ones(self.numvis, dtype=theano.config.floatX), name='bx') self.bhid = [ theano.shared(value=0.0 * np.ones(self.numhid, dtype=theano.config.floatX), name='bhid' + str(k)) for k in range(self.numlayers) ] self.params = self.whh + self.whx + self.wxh + self.bhid + [self.bx] self._batchsize = self.inputs.shape[0] self._input_frames = self.inputs.reshape( (self._batchsize, self.inputs.shape[1] // self.numvis, self.numvis)).transpose(1, 0, 2) #1-step prediction --- self.hids_0 = T.zeros((self._batchsize, self.numhid * self.numlayers)) self.hids_1 = [ T.dot(self.hids_0[:, :self.numhid], self.whh[0]) + self.bhid[0] + T.dot(self._input_frames[0], self.wxh[0]) ] self.hids_1[0] *= (self.hids_1[0] > 0) for k in range(1, self.numlayers): self.hids_1.append( T.dot(self.hids_0[:, k * self.numhid:(k + 1) * self.numhid], self.whh[k]) + self.bhid[k] + T.dot(self.hids_1[k - 1], self.wxh[k])) self.hids_1[-1] *= (self.hids_1[-1] > 0) self.x_pred_1 = self.bx for k in range(self.numlayers): self.x_pred_1 += T.dot(self.hids_1[k], self.whx[k]) self.hids_1 = T.concatenate(self.hids_1, 1) #--- 1-step prediction def step_dropout(x_gt_t, dropoutmask, x_tm1, hids_tm1): hids_tm1 = [ hids_tm1[:, k * self.numhid:(k + 1) * self.numhid] for k in range(self.numlayers) ] pre_hids_t = [ T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0]) ] hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)] for k in range(1, self.numlayers): pre_hids_t.append( T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(dropoutmask * hids_t[k - 1], (1.0 / self.dropout) * self.wxh[k])) hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0)) x_pred_t = self.bx for k in range(self.numlayers): x_pred_t += T.dot(hids_t[k], self.whx[k]) return x_pred_t, T.concatenate(hids_t, 1) def step_nodropout(x_gt_t, x_tm1, hids_tm1): hids_tm1 = [ hids_tm1[:, k * self.numhid:(k + 1) * self.numhid] for k in range(self.numlayers) ] pre_hids_t = [ T.dot(hids_tm1[0], self.whh[0]) + self.bhid[0] + T.dot(x_gt_t, self.wxh[0]) ] hids_t = [pre_hids_t[0] * (pre_hids_t[0] > 0)] for k in range(1, self.numlayers): pre_hids_t.append( T.dot(hids_tm1[k], self.whh[k]) + self.bhid[k] + T.dot(hids_t[k - 
1], self.wxh[k])) hids_t.append(pre_hids_t[k] * (pre_hids_t[k] > 0)) x_pred_t = self.bx for k in range(self.numlayers): x_pred_t += T.dot(hids_t[k], self.whx[k]) return x_pred_t, T.concatenate(hids_t, 1) if self.dropout == 0.0: (self._predictions, self.hids), self.updates = theano.scan( fn=step_nodropout, sequences=self._input_frames, outputs_info=[self._input_frames[0], self.hids_0]) else: self._dropoutmask = self.theano_rng.binomial( size=(self.inputs.shape[1] // self.numvis, self._batchsize, self.numhid), n=1, p=self.dropout, dtype=theano.config.floatX) (self._predictions, self.hids), self.updates = theano.scan( fn=step_dropout, sequences=[self._input_frames, self._dropoutmask], outputs_info=[self._input_frames[0], self.hids_0]) if self.output_type == 'real': self._prediction = self._predictions[:, :, :self.numvis]  # dims: [time step, batch idx, numvis]
elif self.output_type == 'binary': self._prediction = sigmoid(self._predictions[:, :, :self.numvis]) elif self.output_type == 'softmax':
# softmax doesn't support 3d tensors, reshape batch and time axis together, apply softmax and reshape back to 3d tensor
self._prediction = T.nnet.softmax( self._predictions[:, :, :self.numvis].reshape( (self._predictions.shape[0] * self._predictions.shape[1], self.numvis))).reshape( (self._predictions.shape[0], self._predictions.shape[1], self.numvis)) else: raise ValueError('unsupported output_type') self._prediction_for_training = self._prediction[:self.numframes - 1] if self.output_type == 'real': self._cost = T.mean((self._prediction_for_training - self._input_frames[1:self.numframes])**2) self._cost_varlen = T.mean( (self._prediction - self._input_frames[1:])**2)  # for various lengths
elif self.output_type == 'binary': self._cost = -T.mean(self._input_frames[1:self.numframes] * T.log(self._prediction_for_training) + (1.0 - self._input_frames[1:self.numframes]) * T.log(1.0 - self._prediction_for_training)) self._cost_varlen = -T.mean( self._input_frames[1:] * T.log(self._prediction) + (1.0 - self._input_frames[1:]) * T.log(1.0 - self._prediction)) elif self.output_type == 'softmax': self._cost = -T.mean( T.log(self._prediction_for_training) * self._input_frames[1:self.numframes]) self._cost_varlen = -T.mean( T.log(self._prediction) * self._input_frames[1:]) self._grads = T.grad(self._cost, self.params) self.inputs_var = T.fmatrix('inputs_var') self.nsteps = T.lscalar('nsteps') givens = {} givens[self.inputs] = T.concatenate( (self.inputs_var[:, :self.numvis], T.zeros((self.inputs_var.shape[0], self.nsteps * self.numvis))), axis=1)
# predict given the first letters.
self.predict = theano.function( [ self.inputs_var, theano.Param(self.nsteps, default=self.numframes - 4) ], self._prediction.transpose(1, 0, 2).reshape( (self.inputs_var.shape[0], self.nsteps * self.numvis)), updates=self.updates, givens=givens) self.cost = theano.function([self.inputs], self._cost, updates=self.updates) self.grads = theano.function([self.inputs], self._grads, updates=self.updates)
def grad(self, x): def get_cudandarray_value(x): if type(x) == theano.sandbox.cuda.CudaNdarray: return np.array(x.__array__()).flatten() else: return x.flatten() return np.concatenate( [get_cudandarray_value(g) for g in self.grads(x)])
def sample(self, numcases=1, numframes=10, temperature=1.0): assert self.output_type == 'softmax' next_prediction_and_state = theano.function( [self._input_frames, self.hids_0], [ self.theano_rng.multinomial( pvals=T.nnet.softmax(self.x_pred_1 / temperature)), self.hids_1 ]) preds = np.zeros((numcases, numframes, self.numvis), dtype="float32") preds[:, 0, :] = self.numpy_rng.multinomial( 1, pvals=np.ones(self.numvis) / np.float(self.numvis), size=numcases) hids = np.zeros((numcases, self.numhid * self.numlayers), dtype="float32") for t in range(1, numframes): nextpredandstate = next_prediction_and_state( preds[:, [t - 1], :].transpose(1, 0, 2), hids) hids = nextpredandstate[1] preds[:, t, :] = nextpredandstate[0] return preds
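# In the dropout branch above, the mask is drawn with p=self.dropout and the
# masked activations are rescaled by 1/self.dropout, i.e. `dropout` is the
# keep-probability ("inverted dropout"). A numpy sketch of that convention:
import numpy as np

rng = np.random.RandomState(0)
keep_prob = 0.5
h = np.ones((2, 4))
mask = rng.binomial(n=1, p=keep_prob, size=h.shape)
print mask * h / keep_prob  # E[output] == h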