def __init__(self, nvis, nhid, epsilon, batch_size, noise_scaling=1.0,
             lateral_x=False, lateral_h=False, debug=0, n_inference_steps=3,
             initial_noise=0.1, **kwargs):
    """Record the model hyper-parameters and set up child bricks.

    Parameters
    ----------
    nvis : int
        Number of visible units.
    nhid : int
        Number of hidden units.
    epsilon : float
        Step size of the Langevin updates.
    batch_size : int
        Batch size used to allocate the persistent hidden state.
    noise_scaling : float, optional
        Scale of the corruption noise.
    lateral_x, lateral_h : bool, optional
        Enable lateral visible/hidden connections in the energy.
    debug : int, optional
        Verbosity level for debug printing.
    n_inference_steps : int, optional
        Number of inference iterations performed per cost evaluation.
    initial_noise : float, optional
        Noise added to the persistent state at the start of inference.
    """
    super(FivEM, self).__init__(**kwargs)
    # Layer sizes and optimisation hyper-parameters.
    self.nvis = nvis
    self.nhid = nhid
    self.epsilon = epsilon
    self.batch_size = batch_size
    self.noise_scaling = noise_scaling
    self.initial_noise = initial_noise
    # Optional lateral connections and debugging knobs.
    self.lateral_x = lateral_x
    self.lateral_h = lateral_h
    self.debug = debug
    self.n_inference_steps = n_inference_steps
    # Persistent states start at zero; Rho is the only child brick.
    self.states_init = Constant(0)
    self.rho = Rho()
    self.children = [self.rho]
def _initialize(self):
    """Initialize recurrent weights, gate weights and the initial state."""
    #TODO: know what to do after Blocks #740 is resolved:
    # Fall back to the generic initializers when the specialised ones
    # were not configured by the caller.
    if self.recurrent_weights_init is None:
        self.recurrent_weights_init = self.weights_init
    if self.initial_states_init is None:
        self.initial_states_init = Constant(0.0)
    self.recurrent_weights_init.initialize(self.state_to_state, self.rng)
    # Update and reset gate weights are drawn separately, then packed
    # side by side into the single state_to_gates matrix.
    state_to_update = self.weights_init.generate(
        self.rng, (self.dim, self.dim))
    state_to_reset = self.weights_init.generate(
        self.rng, (self.dim, self.dim))
    self.state_to_gates.set_value(
        numpy.hstack([state_to_update, state_to_reset]))
    self.initial_states_init.initialize(self.parameters.initial_state,
                                        self.rng)
from ali.algorithms import ali_algorithm from ali.bricks import (ALI, GaussianConditional, DeterministicConditional, XZJointDiscriminator) from ali.streams import create_handbags_shoes_data_streams from ali.utils import get_log_odds, conv_brick, conv_transpose_brick, bn_brick from ali.backup_model import BackupModel, PlotLoss, PlotAccuracy from ali.miriam_logger import Logger BATCH_SIZE = 100 MONITORING_BATCH_SIZE = 500 NUM_EPOCHS = 150 IMAGE_SIZE = (64, 64) NUM_CHANNELS = 3 NLAT = 256 GAUSSIAN_INIT = IsotropicGaussian(std=0.01) ZERO_INIT = Constant(0) LEARNING_RATE = 1e-4 BETA1 = 0.5 LEAK = 0.02 dropout = 0.4 def create_model_brick(): layers = [ conv_brick(2, 1, 64), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(7, 2, 128), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(5, 2, 256),
def setUp(self):
    """Build and initialize the LSTM brick under test."""
    # Constant initialization keeps the expected values deterministic.
    brick = LSTM(dim=3, weights_init=Constant(2), biases_init=Constant(0))
    brick.initialize()
    self.lstm = brick
class FivEM(Initializable, Random):
    """Implementation of the 5EM model.

    The model this brick represents is a simple bipartite, energy-based,
    undirected graphical model.

    Parameters
    ----------
    nvis : int
        Number of visible units.
    nhid : int
        Number of hidden units.
    epsilon : float
        Step size.
    batch_size : int
        Batch size, used for initializing the persistent states h_prev
        and h.
    noise_scaling : float, optional
        Scale of the gaussian noise injected by :meth:`corrupt`.
    lateral_x : bool, optional
        Include lateral visible-to-visible terms in the energy.
    lateral_h : bool, optional
        Include lateral hidden-to-hidden terms in the energy.
    debug : int, optional
        Verbosity threshold for the :meth:`pp` debug-printing helper.
    n_inference_steps : int, optional
        Number of Langevin inference steps performed in :meth:`cost`.
    initial_noise : float, optional
        Scale of the noise added to the persistent hidden state at the
        start of :meth:`cost`.
    """

    @lazy(allocation=["nvis", "nhid"])
    def __init__(self, nvis, nhid, epsilon, batch_size, noise_scaling=1.0,
                 lateral_x=False, lateral_h=False, debug=0,
                 n_inference_steps=3, initial_noise=0.1, **kwargs):
        super(FivEM, self).__init__(**kwargs)
        self.nvis = nvis
        self.nhid = nhid
        self.lateral_x = lateral_x
        self.lateral_h = lateral_h
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.states_init = Constant(0)
        self.rho = Rho()
        self.noise_scaling = noise_scaling
        self.children = [self.rho]
        self.debug = debug
        self.n_inference_steps = n_inference_steps
        self.initial_noise = initial_noise

    def pp(self, var, name, level=1):
        """Wrap `var` in a debug Print op when ``self.debug >= level``."""
        if self.debug >= level:
            return theano.printing.Print(name)(var)
        else:
            return var

    def _allocate(self):
        """Create parameters, the persistent state and the sampling step."""
        # Visible-to-hidden weights plus per-layer biases.
        Wxh = shared_floatx_nans((self.nvis, self.nhid), name="Wxh")
        self.parameters.append(Wxh)
        add_role(Wxh, WEIGHT)
        # NOTE(review): (self.nhid) is not a tuple -- shared_floatx_nans
        # receives a bare int here (likewise for c below); confirm this
        # is intended rather than (self.nhid,).
        b = shared_floatx_nans((self.nhid), name="b")
        self.parameters.append(b)
        add_role(b, BIAS)
        c = shared_floatx_nans((self.nvis), name="c")
        self.parameters.append(c)
        add_role(c, BIAS)
        # Lateral weights; only enter the energy when lateral_h/lateral_x.
        Whh = shared_floatx_nans((self.nhid, self.nhid), name="Whh")
        self.parameters.append(Whh)
        add_role(Whh, WEIGHT)
        Wxx = shared_floatx_nans((self.nvis, self.nvis), name="Wxx")
        self.parameters.append(Wxx)
        add_role(Wxx, WEIGHT)
        # Persistent hidden state carried across minibatches.
        self.h = shared_floatx_nans((self.batch_size, self.nhid), name="h")
        # Compiled Langevin step that also updates the visible units,
        # usable for generation outside the training graph.
        x = tensor.matrix()
        h = tensor.matrix()
        self.generate_step_f = theano.function(
            inputs=[x, h], outputs=self.langevin_update(x, h, update_x=True))

    def _initialize(self):
        """Initialize all parameters and the persistent hidden state."""
        Wxh, b, c, Whh, Wxx = self.parameters
        self.weights_init.initialize(Wxh, self.rng)
        self.biases_init.initialize(b, self.rng)
        self.biases_init.initialize(c, self.rng)
        self.weights_init.initialize(Whh, self.rng)
        self.weights_init.initialize(Wxx, self.rng)
        self.states_init.initialize(self.h, self.rng)

    # Named accessors into self.parameters (order fixed by _allocate).
    @property
    def Wxh(self):
        return self.parameters[0]

    @property
    def b(self):
        return self.parameters[1]

    @property
    def c(self):
        return self.parameters[2]

    @property
    def Whh(self):
        return self.parameters[3]

    @property
    def Wxx(self):
        return self.parameters[4]

    def energy(self, x, h):
        """Computes the energy function.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.

        Returns
        -------
        A vector with one energy value per example in the batch.
        """
        rx = self.rho.apply(x)
        rh = self.rho.apply(h)
        # Quadratic containment terms minus the visible-hidden interaction
        # and the bias terms.
        energy = (
            0.5 * ((x * x).sum(axis=1) + (h * h).sum(axis=1))
            - (tensor.dot(rx, tensor.tanh(self.Wxh)) * rh).sum(axis=1)
            - tensor.dot(rx, self.c) + tensor.dot(rh, self.b)
        )
        if self.lateral_x:
            energy = energy + (tensor.dot(rx, tensor.tanh(self.Wxx))
                               * rx).sum(axis=1)
        if self.lateral_h:
            energy = energy + (tensor.dot(rh, tensor.tanh(self.Whh))
                               * rh).sum(axis=1)
        return energy

    def langevin_update(self, x, h, update_x=False):
        """Computes state updates according to Langevin dynamics.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.
        update_x : bool, optional
            Whether to return updates for visible states as well.
            Defaults to `False`.
        """
        # Corrupted state minus an epsilon-sized step down the energy
        # gradient.
        if update_x:
            return (
                self.corrupt(x)
                - self.epsilon * tensor.grad(self.energy(x, h).sum(), x),
                self.corrupt(h)
                - self.epsilon * tensor.grad(self.energy(x, h).sum(), h),
            )
        else:
            return self.corrupt(h) - self.epsilon * tensor.grad(self.energy(x, h).sum(), h)

    def map_update(self, x, h):
        """Computes h update going down the energy gradient, given x.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.
        """
        return h - self.epsilon * tensor.grad(self.energy(x, h).sum(), h)

    def corrupt(self, var):
        """Adds zero-mean gaussian noise to the input variable.

        The noise scale is ``2 * epsilon * noise_scaling``.

        Parameters
        ----------
        var : tensor variable
            Input.
        """
        return var + 2 * self.epsilon * self.noise_scaling * self.theano_rng.normal(size=var.shape, dtype=var.dtype)

    @application(inputs=["given_x"], outputs=["value"])
    def cost(self, given_x, application_call):
        """Computes the loss function.

        Parameters
        ----------
        given_x : tensor variable
            Batch of given visible states from dataset.

        Notes
        -----
        The `application_call` argument is an effect of the `application`
        decorator and isn't visible to users. It's used internally to
        set an updates dictionary for `h` that's discoverable by
        `ComputationGraph`.
        """
        x = given_x
        # Jitter the persistent hidden state to start the inference chain.
        h_prev = self.h + self.initial_noise * self.theano_rng.normal(
            size=self.h.shape, dtype=self.h.dtype)
        h = h_next = h_prev
        old_energy = self.pp(self.energy(x, h).sum(), "old_energy", 1)
        for iteration in range(self.n_inference_steps):
            h_prev = h
            h = h_next
            # One Langevin step; disconnected_grad stops backprop from
            # flowing through the inference dynamics themselves.
            h_next = self.pp(
                disconnected_grad(self.langevin_update(
                    self.pp(x, "x", 3), self.pp(h_next, "h", 2))),
                "h_next", 2)
            new_energy = self.pp(self.energy(x, h_next).sum(),
                                 "new_energy", 1)
            # delta_energy only exists for its debug-printing side effect.
            delta_energy = self.pp(old_energy - new_energy,
                                   "delta_energy", 1)
            old_energy = new_energy
            # How far h_next deviates from a deterministic gradient step
            # taken from h_prev.
            h_prediction_residual = (
                h_next - self.pp(h_prev, "h_prev", 3)
                + self.epsilon * tensor.grad(self.energy(x, h_prev).sum(),
                                             h_prev))
            J_h = self.pp((h_prediction_residual
                           * h_prediction_residual).sum(axis=1).mean(axis=0),
                          "J_h", 1)
            # The visible residual is the raw energy gradient at given_x.
            x_prediction_residual = self.pp(
                tensor.grad(self.energy(given_x, h_prev).sum(), given_x),
                "x_residual", 2)
            J_x = self.pp((x_prediction_residual
                           * x_prediction_residual).sum(axis=1).mean(axis=0),
                          "J_x", 1)
            if self.debug > 1:
                application_call.add_auxiliary_variable(
                    J_x, name="J_x" + str(iteration))
                application_call.add_auxiliary_variable(
                    J_h, name="J_h" + str(iteration))
            # Accumulate both residual costs over all inference steps.
            if iteration == 0:
                total_cost = J_h + J_x
            else:
                total_cost = total_cost + J_h + J_x
        per_iteration_cost = total_cost / self.n_inference_steps
        # Persist the final hidden state for the next minibatch.
        updates = OrderedDict([(self.h, h_next)])
        application_call.updates = dict_union(application_call.updates,
                                              updates)
        if self.debug > 0:
            application_call.add_auxiliary_variable(
                per_iteration_cost, name="per_iteration_cost")
        if self.debug > 1:
            # Multiplying by 1 creates copies that can carry new names.
            application_call.add_auxiliary_variable(self.Wxh * 1.0,
                                                    name="Wxh")
            application_call.add_auxiliary_variable(self.Whh * 1.0,
                                                    name="Whh")
            application_call.add_auxiliary_variable(self.Wxx * 1.0,
                                                    name="Wxx")
            application_call.add_auxiliary_variable(self.b * 1, name="b")
            application_call.add_auxiliary_variable(self.c * 1, name="c")
        return self.pp(total_cost, "total_cost")
('week_of_year', 52, 10), ('day_of_week', 7, 10), ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10), ] embed_weights_init = IsotropicGaussian(0.001) class MLPConfig(object): __slots__ = ('dim_input', 'dim_hidden', 'dim_output', 'weights_init', 'biases_init', 'embed_weights_init', 'dim_embeddings') prefix_encoder = MLPConfig() prefix_encoder.dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) prefix_encoder.dim_hidden = [100, 100] prefix_encoder.weights_init = IsotropicGaussian(0.01) prefix_encoder.biases_init = Constant(0.001) prefix_encoder.embed_weights_init = embed_weights_init prefix_encoder.dim_embeddings = dim_embeddings candidate_encoder = MLPConfig() candidate_encoder.dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) candidate_encoder.dim_hidden = [100, 100] candidate_encoder.weights_init = IsotropicGaussian(0.01) candidate_encoder.biases_init = Constant(0.001) candidate_encoder.embed_weights_init = embed_weights_init candidate_encoder.dim_embeddings = dim_embeddings representation_size = 100 representation_activation = Tanh normalize_representation = True
def train(self):
    """Build the LSTM classifier graph and set up the training main loop.

    Reads the current minibatch from the shared variables in
    ``self.sharedBatch``, compiles prediction functions into ``self.f``
    and ``self.lastH``, and stores the algorithm, extensions, model and
    main loop on ``self`` before starting training.
    """
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    x_mask = self.sharedBatch['x_mask']
    x_mask.name = 'x_mask_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'
    if self.usePro:
        # Optional per-example proportions for the weighted loss below.
        proportion = self.sharedBatch['pro']
        proportion.name = 'pro'

    # we need to provide data for the LSTM layer of size 4 * ltsm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx1, self.dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(self.dim, name='lstm', weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(self.dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform, mask=x_mask)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    if self.usePro:
        cost = BinaryCrossEntropyProp().apply(y, y_hat, proportion)
    else:
        cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    # All inputs live in shared variables, hence inputs=[].
    self.f = theano.function(inputs=[], outputs=y_hat)
    self.lastH = theano.function(inputs=[], outputs=h[-1])
    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost, parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost], data_stream=self.stream_valid_int,
        prefix="valid", sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)
    # NOTE(review): train_monitor is created but never added to the
    # extensions list below -- confirm whether that is intentional.
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train", after_epoch=True)
    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest
    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    # NOTE(review): self.main_loop (with underscore) is presumably a
    # separate method that runs self.mainloop -- verify against the rest
    # of the class.
    self.main_loop(True)
def task_ID_layers(x, recurrent_in_size):
    """Replace the trailing task-ID slice of ``x`` with an MLP embedding.

    The last ``len(game_tasks)`` features of ``x`` are zero-padded to the
    MLP's input width, passed through a Rectifier MLP, and concatenated
    back onto the remaining features.  Returns the combined tensor and
    its new feature size.
    """
    n_tasks = len(game_tasks)

    # Feed-forward embedder for the task-ID features.
    embedder = MLP([Rectifier()] * (len(task_ID_FF_dims) - 1),
                   task_ID_FF_dims, name='task_ID_mlp',
                   weights_init=Uniform(width=.2), biases_init=Constant(0))
    embedder.push_initialization_config()
    embedder.initialize()

    # Pad the task-ID slice with zeros up to the MLP input width.
    task_ids = x[:, :, -n_tasks:]
    padding = T.zeros((x.shape[0], x.shape[1],
                       task_ID_FF_dims[0] - n_tasks))
    embedded = embedder.apply(T.concatenate([task_ids, padding], axis=2))

    # Splice the embedding back onto the non-task features.
    combined = T.concatenate([x[:, :, :-n_tasks], embedded], axis=2)
    new_size = task_ID_FF_dims[-1] + recurrent_in_size - n_tasks
    return combined, new_size
k=k, name="emitter") source_names = [name for name in transition.apply.states if 'states' in name] readout = Readout(readout_dim=hidden_size_recurrent, source_names=source_names, emitter=emitter, feedback_brick=feedback, name="readout") generator = SequenceGenerator(readout=readout, transition=transition, name="generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.) generator.push_initialization_config() generator.transition.biases_init = IsotropicGaussian(0.01, 1) generator.transition.push_initialization_config() generator.initialize() states = {} states = generator.transition.apply.outputs states = { name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states }
def default_init(brick):
    """Give *brick* the default initialization scheme and run it.

    Weights are drawn from U(-0.08, 0.08); biases start at zero.
    """
    brick.biases_init = Constant(0)
    brick.weights_init = Uniform(width=0.08)
    brick.initialize()
def main():
    """Train a Groundhog-backed RNN language model or sample from one.

    Command-line driven: ``train`` builds the cost graph (optionally
    carrying the hidden state across batches) and runs SGD; ``sample``
    loads saved parameters and prints generated character sequences.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of language modeling with RNN",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
        " and `sample` to sample a sequence generated by an"
        " existing one.")
    parser.add_argument("prefix", default="sine",
                        help="The prefix for model, timing and state files")
    parser.add_argument("state", nargs="?", default="",
                        help="Changes to Groundhog state")
    parser.add_argument("--path", help="Path to a language dataset")
    parser.add_argument("--dict", help="Path to the dataset dictionary")
    parser.add_argument("--restart", help="Start anew")
    parser.add_argument("--reset", action="store_true", default=False,
                        help="Reset the hidden state between batches")
    parser.add_argument("--steps", type=int, default=100,
                        help="Number of steps to plot for the 'sample' mode"
                        " OR training sequence length for the 'train' mode.")
    args = parser.parse_args()
    logger.debug("Args:\n" + str(args))

    dim = 200
    num_chars = 50
    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim, weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_chars, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(num_chars, dim,
                                                name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value
         in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        batch_size = 1
        seq_len = args.steps
        generator.initialize()

        # Build cost computation graph that uses the saved hidden states.
        # An issue: for Groundhog this is completely transparent, that's
        # why it does not carry the hidden state over the period when
        # validation in done. We should find a way to fix in the future.
        x = tensor.lmatrix('x')
        init_states = shared_floatx_zeros((batch_size, dim),
                                          name='init_states')
        reset = tensor.scalar('reset')
        cost = ComputationGraph(
            generator.cost(x, states=init_states * reset).sum())
        # TODO: better search routine
        # Locate the recurrent 'states' output variable inside the graph.
        states = [v for v in cost.variables
                  if hasattr(v.tag, 'application_call')
                  and v.tag.application_call.brick == generator.transition
                  and (v.tag.application_call.application ==
                       generator.transition.apply)
                  and v.tag.role == VariableRole.OUTPUT
                  and v.tag.name == 'states']
        assert len(states) == 1
        states = states[0]

        gh_model = GroundhogModel(generator, cost)
        # Bits-per-character derived from the nats-based cost.
        gh_model.properties.append(
            ('bpc', cost.outputs[0] * numpy.log(2) / seq_len))
        gh_model.properties.append(('mean_init_state', init_states.mean()))
        gh_model.properties.append(('reset', reset))
        if not args.reset:
            # Carry the final hidden state over into the next batch.
            gh_model.updates.append((init_states, states[-1]))
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        # NOTE(review): eval() on a command-line string -- acceptable for
        # a research script, never expose to untrusted input.
        changes = eval("dict({})".format(args.state))
        state.update(changes)

        def output_format(x, y, reset):
            return dict(x=x[:, None], reset=reset)
        train, valid, test = [
            LMIterator(batch_size=batch_size,
                       use_infinite_loop=mode == 'train',
                       path=args.path,
                       seq_len=seq_len,
                       mode=mode,
                       chunks='chars',
                       output_format=output_format,
                       can_fit=True)
            for mode in ['train', 'valid', 'test']]

        trainer = SGD(gh_model, state, train)
        state['on_nan'] = 'warn'
        state['cutoff'] = 1.

        main_loop = MainLoop(train, valid, None, gh_model, trainer, state,
                             None)
        if not args.restart:
            main_loop.load()
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        chars = numpy.load(args.dict)['unique_chars']
        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=10, iterate=True)).function()
        states, outputs, costs = sample()

        for i in range(10):
            print("Generation cost: {}".format(costs[:, i].sum()))
            print("".join([chars[o] for o in outputs[:, i]]))
    else:
        assert False
def main(config):
    """Build and initialize an NMT encoder-decoder from ``config``.

    Sets up the training model and extensions, then runs the
    ``before_training`` extension hook.  NOTE(review): the main loop is
    created with ``algorithm=None`` and ``data_stream=None``, so it
    cannot actually train -- confirm this driver only exercises the
    extensions.
    """
    print('working on it ...')
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent transitions get orthogonal initialization instead.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Extensions
    extensions = []

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up beam search and sampling computation graphs if necessary
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs'''

        # Add sampling
        logger.info("Building sampler")
        global samplers_ob
        # NOTE(review): input_sentence_mask is not defined anywhere in
        # this function -- presumably a module-level global; verify.
        samplers_ob = Sampler(model=search_model,
                              data_stream=input_sentence_mask,
                              hook_samples=config['hook_samples'],
                              every_n_batches=config['sampling_freq'],
                              src_vocab_size=config['src_vocab_size'])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )
    # Wire the extensions to the loop manually and fire the
    # before_training hook.
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_.
    Every unit of a GRNN is equipped with update and reset gates that
    facilitate better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.
    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None,
                 **kwargs):
        self.dim = dim
        # Specialised initializers; _initialize falls back to
        # weights_init / Constant(0) when these remain None.
        self.recurrent_weights_init = None
        self.initial_states_init = None
        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation
        children = [activation, gate_activation] + kwargs.get('children', [])
        super(GatedRecurrent, self).__init__(children=children, **kwargs)

    # Named accessors into self.parameters (order fixed by _allocate).
    @property
    def state_to_state(self):
        return self.parameters[0]

    @property
    def state_to_gates(self):
        return self.parameters[1]

    @property
    def initial_states_(self):
        return self.parameters[2]

    def get_dim(self, name):
        """Return the dimensionality of the named input/state/output."""
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        # Update and reset gate inputs are packed side by side.
        if name == 'gate_inputs':
            return 2 * self.dim
        return super(GatedRecurrent, self).get_dim(name)

    def _allocate(self):
        """Create recurrent weights, gate weights and the initial state."""
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim),
                               name='state_to_state'))
        add_role(self.parameters[-1], WEIGHT)
        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        add_role(self.parameters[-1], WEIGHT)
        self.parameters.append(
            shared_floatx_nans((self.dim, ), name="initial_state"))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        """Initialize recurrent weights, gate weights and initial state."""
        #TODO: know what to do after Blocks #740 is resolved:
        # Fall back to the generic initializers when the specialised
        # ones were not configured.
        if self.recurrent_weights_init is None:
            self.recurrent_weights_init = self.weights_init
        if self.initial_states_init is None:
            self.initial_states_init = Constant(0.0)
        self.recurrent_weights_init.initialize(self.state_to_state,
                                               self.rng)
        # Update and reset gate weights are drawn separately and packed
        # side by side into state_to_gates.
        state_to_update = self.weights_init.generate(self.rng,
                                                     (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(self.rng,
                                                    (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))
        self.initial_states_init.initialize(self.parameters.initial_state,
                                            self.rng)

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape
            (batch_size, dim)
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the shape
            (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there
            is data available, 0 if not. Assumed to be 1-s only if not
            given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.
        """
        # First half of the packed gate output is the update gate, the
        # second half the reset gate.
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        states_reset = states * reset_values
        # Candidate state from the reset-gated previous state.
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        # Convex combination of candidate and previous state.
        next_states = (next_states * update_values
                       + states * (1 - update_values))
        if mask:
            # Keep the old state wherever the mask marks padding.
            next_states = (mask[:, None] * next_states
                           + (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        """Broadcast the learned initial state across the batch."""
        return [
            tensor.repeat(self.parameters.initial_state[None, :],
                          batch_size, 0)
        ]
def __init__(self, ref_data, output_dim):
    """Build the reference-based MLP classification graph.

    Parameters
    ----------
    ref_data : numpy array
        Reference data matrix (one row per reference point); optionally
        PCA-reduced when the module-level ``pca_dims`` is set.
    output_dim : int
        Number of output classes.

    Relies on module-level configuration globals (``hidden_dims``,
    ``n_inter``, dropout/noise/penalty settings, ...) and exposes the
    resulting Theano expressions as ``self.cost``, ``self.cost_reg``,
    ``self.ber``, ``self.ber_reg``, ``self.pred`` and
    ``self.confidence``.
    """
    if pca_dims is not None:
        # Project ref_data onto its top-pca_dims principal directions.
        covmat = numpy.dot(ref_data.T, ref_data)
        ev, evec = numpy.linalg.eig(covmat)
        best_i = ev.argsort()[-pca_dims:]
        best_evecs = evec[:, best_i]
        best_evecs = best_evecs / numpy.sqrt(
            (best_evecs**2).sum(axis=0))  # normalize
        ref_data = numpy.dot(ref_data, best_evecs)

    input_dim = ref_data.shape[1]
    ref_data_sh = theano.shared(numpy.array(ref_data,
                                            dtype=numpy.float32),
                                name='ref_data')

    # Construct the model
    j = tensor.lvector('j')
    r = ref_data_sh[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim] + hidden_dims + [n_inter],
              name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    inter_weights = mlp.apply(r)

    # Fixed: "is None" instead of "== None" -- identity test is the
    # correct (and PEP 8) check; "==" can misbehave for array-likes.
    if inter_bias is None:
        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = ibias.apply(tensor.dot(x, inter_weights))
    else:
        inter = tensor.dot(x, inter_weights) - inter_bias
    inter = inter_act_fun.apply(inter)

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    # error_rate = tensor.neq(y, pred).mean()
    ber = balanced_error_rate.ber(y, pred)

    # Initialize parameters
    for brick in [mlp, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, ber])
    # Fixed: s_dropout_vars was only bound inside the s_dropout branch,
    # so a_dropout != 0 with s_dropout == 0 raised NameError below.
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)

    if x_dropout != 0:
        cg = apply_dropout(cg, [x], x_dropout)

    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(set(
            VariableFilter(bricks=[Tanh], name='output')(
                ComputationGraph([inter_weights])))
            - set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)

    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)

    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(set(
            VariableFilter(bricks=[Tanh], name='output')(
                ComputationGraph([final])))
            - set([inter_weights]) - set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)

    if r_noise_std != 0:
        cg = apply_noise(cg, [r], r_noise_std)

    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)

    [cost_reg, ber_reg] = cg.outputs

    # Optional L1 penalties on the two MLPs and the intermediate layer.
    if s_l1pen != 0:
        s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + s_l1pen * sum(abs(w).sum()
                                            for w in s_weights)
    if i_l1pen != 0:
        cost_reg = cost_reg + i_l1pen * abs(inter).sum()
    if a_l1pen != 0:
        a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + a_l1pen * sum(abs(w).sum()
                                            for w in a_weights)

    self.cost = cost
    self.cost_reg = cost_reg
    self.ber = ber
    self.ber_reg = ber_reg
    self.pred = pred
    self.confidence = confidence
def main_rnn(config):
    """Train a stacked-LSTM sequence generator on MFCC features.

    ``config`` is a dict of hyper-parameters (``lstm_hidden_size``,
    ``target_size``, ``batch_size``, ``learning_rate``, ...).
    """
    x = tensor.tensor3('features')
    # NOTE(review): y is currently unused in the active graph below.
    y = tensor.matrix('targets')

    # if 'LSTM' in config['model'] :
    #     from models import getLSTMstack
    #     y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1]))
    # else :
    #     raise Exception("These are not the LSTM we are looking for")
    # y_hat = model.apply(x)

    emitter = TestEmitter()
    # emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])
    # cost_func = SquaredError()
    # @application
    # def qwe(self, readouts, outputs=None):
    #     print(type(self), type(readouts))
    #     x = cost_func.apply(readouts,outputs)
    #     return x
    print(type(emitter.cost))
    # emitter.cost = qwe
    # print(type(qwe))

    steps = 2
    n_samples = config['target_size']

    # Four stacked LSTM layers, no skip connections.
    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition, name="transition",
                                skip_connections=False)
    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'],
                      source_names=source_names, feedback_brick=None,
                      merge=None, merge_prototype=None, post_merge=None,
                      merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None,
                               add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()
    seqgen.transition.biases_init = IsotropicGaussian(0.01, 1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    # One zero-initialized shared variable per recurrent state.
    states = seqgen.transition.apply.outputs
    print('states', states)
    states = {name: shared_floatx_zeros((n_samples,
                                         config['lstm_hidden_size']))
              for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)
    # Cost
    # cost = SquaredError().apply(y_hat ,y)
    # cost = CategoricalCrossEntropy().apply(T.flatten(),Y)
    #
    # for sampling
    # cg = ComputationGraph(seqgen.generate(n_steps=steps,
    #                                       batch_size=n_samples,
    #                                       iterate=True))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    # Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],
                                   config['source_size'],
                                   config['target_size'],
                                   config['num_examples'])

    # Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  # DataStreamMonitoring([cost, error_rate], test_stream, prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train",
                                         every_n_batches=1),
                  # Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]

    main_loop = MainLoop(algorithm, train_stream,
                         # model=model,
                         extensions=extensions)

    main_loop.run()
def main(save_to, hist_file):
    """Load a trained LeNet, ablate selected MLP units, and re-evaluate.

    Parameters
    ----------
    save_to : str
        Path to the pickled trained parameters.
    hist_file : str
        Path to a pickled histogram dictionary used to pick the units
        to ablate.
    """
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # NOTE(review): zip() is a one-shot iterator on Python 3 -- if LeNet
    # iterates filter_sizes/pooling_sizes more than once this silently
    # yields nothing the second time; confirm (list(zip(...)) is safer).
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(
        width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(
        width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i,
                                                layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    mnist_test = MNIST(("test", ), sources=['features', 'targets'])

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                 .copy(name='confusion'))
    confusion.tag.aggregation_scheme = Sum(confusion)

    model = Model([error_rate, confusion])

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    def full_brick_name(brick):
        return '/'.join([''] + [b.name for b in brick.get_unique_path()])

    # Find layer outputs to probe
    outs = OrderedDict(
        (full_brick_name(get_brick(out)), out)
        for out in VariableFilter(
            roles=[OUTPUT], bricks=[Convolutional, Linear])(model.variables))

    # Load histogram information
    with open(hist_file, 'rb') as handle:
        histograms = pickle.load(handle)

    # Corpora
    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples,
                                        batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples,
                                        batch_size))

    # Probe the given layer
    target_layer = '/lenet/mlp/linear_0'
    next_layer_param = '/lenet/mlp/linear_1.W'
    sample = extract_sample(outs[target_layer], mnist_test_stream)
    print('sample shape', sample.shape)

    # Figure neurons to ablate: units whose histogram rows 2 and 7 have
    # opposite signs.
    hist = histograms[('linear_1', 'b')]
    targets = [i for i in range(hist.shape[1])
               if hist[2, i] * hist[7, i] < 0]
    print('ablating', len(targets), ':', targets)

    # Now adjust the next layer weights based on the probe
    param = model.get_parameter_dict()[next_layer_param]
    print('param shape', param.get_value().shape)

    new_weights = ablate_inputs(targets, sample, param.get_value(),
                                compensate=False)
    param.set_value(new_weights)

    # Evaluation pass
    evaluator = DatasetEvaluator([error_rate, confusion])
    print(evaluator.evaluate(mnist_test_stream))
def main(mode, save_path, steps, num_batches):
    """Train a recurrent sequence generator on a Markov chain, or sample
    from a previously trained one.

    Parameters
    ----------
    mode : str
        Either "train" or "sample".
    save_path : str
        Path where the main loop is serialized to (train) / loaded from
        (sample).
    steps : int
        Number of steps to generate in "sample" mode.
    num_batches : int
        Number of batches to train for in "train" mode.

    Raises
    ------
    ValueError
        If `mode` is neither "train" nor "sample".
    """
    num_states = MarkovChainDataset.num_states
    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states,
                          source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[
                FinishAfter(after_n_batches=num_batches),
                TrainingDataMonitoring([cost], prefix="this_step",
                                       after_every_batch=True),
                TrainingDataMonitoring([cost], prefix="average",
                                       every_n_batches=100),
                SerializeMainLoop(save_path, every_n_batches=500),
                Printing(every_n_batches=100)
            ])
        main_loop.run()
    elif mode == "sample":
        # `with` closes the file handle (the original leaked it).
        with open(save_path, "rb") as source:
            main_loop = cPickle.load(source)
        generator = main_loop.model
        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))
        # Empirical state frequencies vs. the chain's equilibrium.
        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))
        # Empirical transition frequencies vs. the true transition matrix.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        # Fail loudly on an unknown mode; the original `assert False`
        # would be silently stripped under `python -O`.
        raise ValueError("Unknown mode: {}".format(mode))
def main(mode, save_path, num_batches, data_path=None):
    """Train the word-reversal sequence model, or sample/beam-search from it.

    Parameters
    ----------
    mode : str
        "train", "sample" or "beam_search".
    save_path : str
        Checkpoint path (written in "train" mode, read otherwise).
    num_batches : int
        How many batches to train for.
    data_path : str, optional
        Text file to train on; defaults to the One Billion Word corpus.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")
    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        # The reversed words become the prediction targets.
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        # Orthogonal initialization for the recurrent weights.
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in parameters.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies, ) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        # Also monitor the norm of every parameter and of its gradient.
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[parameter].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Incapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.
            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator],
                    name="outputs")(ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search({chars: input_},
                                                    char2code['</S>'],
                                                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                # Trim each sequence at its end-of-sequence marker (if
                # present) and sum per-step costs up to that point.
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        # Interactive loop: read a sentence, generate, report results.
        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            # The same input is repeated batch_size times column-wise.
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def construct_model(input_dim, out_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx # r_rep is nx x nj x nr r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (nr + 1) concat = tensor.concatenate([r_rep, x3], axis=2) # Change concat from Batch x Time x Features to T X B x F rnn_input = concat.dimshuffle(1, 0, 2) if use_ensembling: # Split time dimension into batches of size num_feats # Join that dimension with the B dimension ens_shape = (num_feats, rnn_input.shape[0] / num_feats, rnn_input.shape[1]) rnn_input = rnn_input.reshape(ens_shape + (input_dim + 1, )) rnn_input = rnn_input.reshape( (ens_shape[0], ens_shape[1] * ens_shape[2], input_dim + 1)) linear = Linear(input_dim=input_dim + 1, output_dim=4 * hidden_dim, name="input_linear") lstm = LSTM(dim=hidden_dim, activation=activation_function, name="hidden_recurrent") top_linear = Linear(input_dim=hidden_dim, output_dim=out_dim, name="out_linear") pre_rnn = linear.apply(rnn_input) states = lstm.apply(pre_rnn)[0] activations = top_linear.apply(states) if use_ensembling: activations = activations.reshape(ens_shape + (out_dim, )) # Unsplit batches (ensembling) activations = tensor.mean(activations, axis=1) # Mean over time activations = tensor.mean(activations, axis=0) cost = Softmax().categorical_cross_entropy(y, activations) pred = activations.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in (linear, lstm, top_linear): brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # apply noise cg = ComputationGraph([cost, error_rate]) noise_vars = VariableFilter(roles=[WEIGHT])(cg) apply_noise(cg, noise_vars, noise_std) [cost_reg, error_rate_reg] = cg.outputs return cost_reg, error_rate_reg, cost, error_rate
def main(self):
    """Train an ALI model on a 2-D Gaussian-mixture toy dataset.

    Writes a `.fuelrc` with a random seed into the working directory,
    builds the ALI bricks (encoder, decoder, joint discriminator), and
    runs the Blocks main loop, logging metrics and checkpoints into
    `self._work_dir`.
    """
    import itertools
    import numpy
    from theano import tensor
    from blocks.algorithms import Adam
    from blocks.bricks import MLP, Rectifier, Identity, LinearMaxout, Linear
    from blocks.bricks.bn import BatchNormalization
    from blocks.bricks.sequences import Sequence
    from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
    from blocks.extensions.monitoring import DataStreamMonitoring
    from blocks.extensions.saveload import Checkpoint
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.graph.bn import (batch_normalization,
                                 get_batch_normalization_updates)
    from blocks.filter import VariableFilter
    from blocks.initialization import IsotropicGaussian, Constant
    from blocks.model import Model
    from blocks.main_loop import MainLoop
    from blocks.roles import INPUT
    from ali.algorithms import ali_algorithm
    from ali.streams import create_gaussian_mixture_data_streams
    from ali.bricks import (ALI, COVConditional, DeterministicConditional,
                            XZJointDiscriminator)
    from ali.utils import as_array
    from blocks.select import Selector
    import logging
    import argparse
    from pacgan.extensions import ModelLogger, GraphLogger, MetricLogger
    import fuel
    from math import cos, sin

    # Seed Fuel via a .fuelrc in the work dir so the run is reproducible
    # from the saved file.
    seed = random.randint(1, 100000)
    fuelrc_path = os.path.join(self._work_dir, ".fuelrc")
    f = open(fuelrc_path, "w")
    f.write("default_seed: {}\n".format(seed))
    f.close()
    fuel.config.default_seed = seed

    # Experiment hyper-parameters.
    INPUT_DIM = 2
    NLAT = 2
    GEN_HIDDEN = 400
    DISC_HIDDEN = 200
    GEN_ACTIVATION = Rectifier
    MAXOUT_PIECES = 5
    GAUSSIAN_INIT = IsotropicGaussian(std=0.02)
    ZERO_INIT = Constant(0.0)
    NUM_EPOCHS = 400
    LEARNING_RATE = 1e-4
    BETA1 = 0.8
    BATCH_SIZE = 100
    MONITORING_BATCH_SIZE = 500
    # Eight Gaussian modes evenly spaced on the unit circle.
    MEANS = [
        numpy.array(
            [cos(id * numpy.pi * 2 / 8),
             sin(id * numpy.pi * 2 / 8)]) for id in range(8)
    ]
    VARIANCES = [0.01**2 * numpy.eye(len(mean)) for mean in MEANS]
    PRIORS = None

    def create_model_brick():
        # Build and initialize the full ALI brick.
        encoder_mapping = MLP(
            dims=[2 * INPUT_DIM, GEN_HIDDEN, GEN_HIDDEN, NLAT],
            activations=[
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='encoder_h1'),
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='encoder_h2'),
                Identity(name='encoder_out')
            ],
            use_bias=False,
            name='encoder_mapping')
        encoder = COVConditional(encoder_mapping, (INPUT_DIM, ),
                                 name='encoder')
        decoder_mapping = MLP(
            dims=[
                NLAT, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN,
                INPUT_DIM
            ],
            activations=[
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='decoder_h1'),
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='decoder_h2'),
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='decoder_h3'),
                Sequence([
                    BatchNormalization(GEN_HIDDEN).apply,
                    GEN_ACTIVATION().apply
                ], name='decoder_h4'),
                Identity(name='decoder_out')
            ],
            use_bias=False,
            name='decoder_mapping')
        decoder = DeterministicConditional(decoder_mapping, name='decoder')
        x_discriminator = Identity(name='x_discriminator')
        z_discriminator = Identity(name='z_discriminator')
        # Three maxout layers followed by a linear scalar output.
        joint_discriminator = Sequence(application_methods=[
            LinearMaxout(input_dim=INPUT_DIM + NLAT,
                         output_dim=DISC_HIDDEN,
                         num_pieces=MAXOUT_PIECES,
                         weights_init=GAUSSIAN_INIT,
                         biases_init=ZERO_INIT,
                         name='discriminator_h1').apply,
            LinearMaxout(input_dim=DISC_HIDDEN,
                         output_dim=DISC_HIDDEN,
                         num_pieces=MAXOUT_PIECES,
                         weights_init=GAUSSIAN_INIT,
                         biases_init=ZERO_INIT,
                         name='discriminator_h2').apply,
            LinearMaxout(input_dim=DISC_HIDDEN,
                         output_dim=DISC_HIDDEN,
                         num_pieces=MAXOUT_PIECES,
                         weights_init=GAUSSIAN_INIT,
                         biases_init=ZERO_INIT,
                         name='discriminator_h3').apply,
            Linear(input_dim=DISC_HIDDEN,
                   output_dim=1,
                   weights_init=GAUSSIAN_INIT,
                   biases_init=ZERO_INIT,
                   name='discriminator_out').apply
        ],
                                       name='joint_discriminator')
        discriminator = XZJointDiscriminator(x_discriminator,
                                             z_discriminator,
                                             joint_discriminator,
                                             name='discriminator')
        ali = ALI(encoder=encoder,
                  decoder=decoder,
                  discriminator=discriminator,
                  weights_init=GAUSSIAN_INIT,
                  biases_init=ZERO_INIT,
                  name='ali')
        ali.push_allocation_config()
        # Only the output layers of the (otherwise bias-free) MLPs get a
        # bias term.
        encoder_mapping.linear_transformations[-1].use_bias = True
        decoder_mapping.linear_transformations[-1].use_bias = True
        ali.initialize()
        print("Number of parameters in discriminator: {}".format(
            numpy.sum([
                numpy.prod(v.shape.eval()) for v in Selector(
                    ali.discriminator).get_parameters().values()
            ])))
        print("Number of parameters in encoder: {}".format(
            numpy.sum([
                numpy.prod(v.shape.eval())
                for v in Selector(ali.encoder).get_parameters().values()
            ])))
        print("Number of parameters in decoder: {}".format(
            numpy.sum([
                numpy.prod(v.shape.eval())
                for v in Selector(ali.decoder).get_parameters().values()
            ])))
        return ali

    def create_models():
        # Build the plain and batch-normalized computation graphs over the
        # same brick.
        ali = create_model_brick()
        x = tensor.matrix('features')
        z = ali.theano_rng.normal(size=(x.shape[0], NLAT))

        def _create_model(with_dropout):
            cg = ComputationGraph(ali.compute_losses(x, z))
            if with_dropout:
                inputs = VariableFilter(bricks=ali.discriminator.
                                        joint_discriminator.children[1:],
                                        roles=[INPUT])(cg.variables)
                cg = apply_dropout(cg, inputs, 0.5)
                inputs = VariableFilter(
                    bricks=[ali.discriminator.joint_discriminator],
                    roles=[INPUT])(cg.variables)
                cg = apply_dropout(cg, inputs, 0.2)
            return Model(cg.outputs)

        model = _create_model(with_dropout=False)
        with batch_normalization(ali):
            bn_model = _create_model(with_dropout=False)
        # Population-statistic updates as an exponential moving average
        # (decay 0.95).
        pop_updates = list(
            set(
                get_batch_normalization_updates(bn_model,
                                                allow_duplicates=True)))
        bn_updates = [(p, m * 0.05 + p * 0.95) for p, m in pop_updates]
        return model, bn_model, bn_updates

    def create_main_loop():
        # Assemble streams, algorithm and extensions into a MainLoop.
        model, bn_model, bn_updates = create_models()
        ali, = bn_model.top_bricks
        discriminator_loss, generator_loss = bn_model.outputs
        step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1)
        algorithm = ali_algorithm(discriminator_loss,
                                  ali.discriminator_parameters, step_rule,
                                  generator_loss, ali.generator_parameters,
                                  step_rule)
        algorithm.add_updates(bn_updates)
        streams = create_gaussian_mixture_data_streams(
            batch_size=BATCH_SIZE,
            monitoring_batch_size=MONITORING_BATCH_SIZE,
            means=MEANS,
            variances=VARIANCES,
            priors=PRIORS)
        main_loop_stream, train_monitor_stream, valid_monitor_stream = streams
        # Monitor everything except the (noisy) norm variables.
        bn_monitored_variables = ([
            v for v in bn_model.auxiliary_variables if 'norm' not in v.name
        ] + bn_model.outputs)
        monitored_variables = (
            [v for v in model.auxiliary_variables if 'norm' not in v.name] +
            model.outputs)
        extensions = [
            Timing(),
            FinishAfter(after_n_epochs=NUM_EPOCHS),
            DataStreamMonitoring(bn_monitored_variables,
                                 train_monitor_stream,
                                 prefix="train",
                                 updates=bn_updates),
            DataStreamMonitoring(monitored_variables,
                                 valid_monitor_stream,
                                 prefix="valid"),
            Checkpoint(os.path.join(self._work_dir, "main_loop.tar"),
                       after_epoch=True,
                       after_training=True,
                       use_cpickle=True),
            ProgressBar(),
            Printing(),
            #ModelLogger(folder=self._work_dir, after_epoch=True),
            GraphLogger(num_modes=1,
                        num_samples=2500,
                        dimension=2,
                        r=0,
                        std=1,
                        folder=self._work_dir,
                        after_epoch=True,
                        after_training=True),
            MetricLogger(means=MEANS,
                         variances=VARIANCES,
                         folder=self._work_dir,
                         after_epoch=True)
        ]
        main_loop = MainLoop(model=bn_model,
                             data_stream=main_loop_stream,
                             algorithm=algorithm,
                             extensions=extensions)
        return main_loop

    main_loop = create_main_loop()
    main_loop.run()
def initialize(to_init, weights_init=Uniform(width=0.08), biases_init=Constant(0)): for bricks in to_init: bricks.weights_init = weights_init bricks.biases_init = biases_init bricks.initialize()
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):
    """Train a DRAW model.

    Parameters
    ----------
    name : str or None
        Experiment name; defaults to the dataset name.
    dataset : str
        Dataset identifier understood by `datasets.get_data`.
    attention : str
        "readN,writeN" to enable attention, or "" for full reads/writes.
    n_iter : int
        Number of DRAW iterations.
    oldmodel : str or None
        Optional pickle to warm-start the parameters from.
    """
    image_size, channels, data_train, data_valid, data_test = datasets.get_data(
        dataset)

    train_stream = Flatten(
        DataStream.default_stream(data_train,
                                  iteration_scheme=SequentialScheme(
                                      data_train.num_examples, batch_size)))
    valid_stream = Flatten(
        DataStream.default_stream(data_valid,
                                  iteration_scheme=SequentialScheme(
                                      data_valid.num_examples, batch_size)))
    test_stream = Flatten(
        DataStream.default_stream(data_test,
                                  iteration_scheme=SequentialScheme(
                                      data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    # Initialization schemes for recurrent and feed-forward bricks.
    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N**2
        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        # No attention: the reader sees the whole image (twice its size,
        # per the 2 * x_dim read dimension).
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------
    # (duplicate of the identical check above -- harmless)
    if name is None:
        name = dataset

    # Learning rate
    def lr_tag(value):
        """Convert a float into a short tag-usable string representation.

        E.g.: 0.1 -> 11, 0.01 -> 12, 0.001 -> 13, 0.005 -> 53
        (leading digit followed by the negated decimal exponent).
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    # NOTE(review): pickle_file appears unused below -- confirm.
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print(" dataset: %s" % dataset)
    print(" subdirectory: %s" % subdir)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print(" epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------
    # Build the DRAW bricks.  The MLPs output 4 * dim because they feed
    # the four LSTM gates.
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    # Cost: reconstruction cross-entropy plus the summed KL terms (the
    # variational bound on the negative log-likelihood).
    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors: the bound plus one KL term per DRAW iteration.
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            ##     updates=scan_updates,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                # updates=scan_updates,
                prefix="test"),
            PartsOnlyCheckpoint("{}/{}".format(subdir, name),
                                before_training=True,
                                after_epoch=True,
                                save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            # Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    # Optionally warm-start parameters from a previously trained model.
    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
def check_constant(const, shape, ground_truth): # rng unused, so pass None. init = Constant(const).generate(None, ground_truth.shape) assert_(ground_truth.dtype == theano.config.floatX) assert_(ground_truth.shape == init.shape) assert_equal(ground_truth, init)
import data from model.time_mlp import Model, Stream n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory dim_embeddings = [ ('origin_call', data.origin_call_train_size, 10), ('origin_stand', data.stands_size, 10), ('week_of_year', 52, 10), ('day_of_week', 7, 10), ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10), ('taxi_id', 448, 10), ] dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) dim_hidden = [500, 100] dim_output = 1 embed_weights_init = IsotropicGaussian(0.001) mlp_weights_init = IsotropicGaussian(0.01) mlp_biases_init = Constant(0.001) exp_base = 1.5 learning_rate = 0.00001 momentum = 0.99 batch_size = 32 max_splits = 100
def test_attention_recurrent():
    """Smoke-test AttentionRecurrent: output shapes, parameter count,
    mask handling and fixed numeric regression values."""
    rng = numpy.random.RandomState(1234)

    dim = 5
    batch_size = 4
    input_length = 20

    attended_dim = 10
    attended_length = 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(state_names=wrapped.apply.states,
                                         attended_dim=attended_dim,
                                         match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(inputs=inputs,
                              mask=inputs_mask,
                              attended=attended,
                              attended_mask=attended_mask)
    states, glimpses, weights = outputs
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    # For values.
    def rand(size):
        return rng.uniform(size=size).astype(theano.config.floatX)

    # For masks.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=theano.config.floatX)
        # To make it look like read data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    input_vals = rand((input_length, batch_size, dim))
    input_mask_vals = generate_mask(input_length, batch_size)
    attended_vals = rand((attended_length, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_length, batch_size)

    func = theano.function([inputs, inputs_mask, attended, attended_mask],
                           [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(input_vals,
                                                   input_mask_vals,
                                                   attended_vals,
                                                   attended_mask_vals)
    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    assert (len(ComputationGraph(outputs).shared_variables) == len(
        Selector(recurrent).get_parameters()))

    # Weights at *masked* attended positions must be zero.
    # (The original comment had "masked"/"not masked" swapped w.r.t.
    # these two assertions.)
    assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0)
    # Weights at *unmasked* attended positions must be non-zero; the
    # +(1 - mask) offset makes masked positions pass trivially.
    assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5)
    # weights from different steps should be noticeably different
    assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2
    # weights for all state after the last masked position should be same
    for i in range(batch_size):
        last = int(input_mask_vals[:, i].sum())
        for j in range(last, input_length):
            assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)
    # freeze sums
    assert_allclose(weight_vals.sum(), input_length * batch_size, 1e-5)
    assert_allclose(states_vals.sum(), 113.429, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 415.901, rtol=1e-5)
act = Rectifier elif activation_function == 'tanh': act = Tanh elif activation_function == 'sigmoid': act = Logistic elif activation_function == 'softplus': act = Softplus layers_act = [act('layer_%d' % i) for i in range(len(hidden_size))] NADE_CF_model = tabula_NADE(activations=layers_act, input_dim0=input_dim0, input_dim1=input_dim1, C_dim=C_dim, other_dims=hidden_size, batch_size=batch_size, weights_init=Uniform(std=0.05), biases_init=Constant(0.0) ) NADE_CF_model.push_initialization_config() dims = [input_dim0] + hidden_size + [input_dim0] linear_layers = [layer for layer in NADE_CF_model.children if 'linear' in layer.name] assert len(linear_layers) == len(dims) - 1 for i in range(len(linear_layers)): H1 = dims[i] H2 = dims[i + 1] width = 2 * np.sqrt(6) / np.sqrt(H1 + H2) # std = np.sqrt(2. / dim) linear_layers[i].weights_init = Uniform(width=width) # NADE_CF_model.children[0].weights_init = Constant(1)
from blocks.roles import WEIGHT from blocks.graph import ComputationGraph from blocks.filter import VariableFilter cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() cost.name = 'cost_with_regularization' from blocks.bricks import MLP mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x) from blocks.initialization import IsotropicGaussian, Constant input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01) input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0) input_to_hidden.initialize() hidden_to_output.initialize() from fuel.datasets import MNIST mnist = MNIST(("train",)) from fuel.streams import DataStream from fuel.schemes import SequentialScheme from fuel.transformers import Flatten data_stream = Flatten(DataStream.default_stream( mnist,
def build_and_run(label, config):
    """Build the convnet described by `config`, then train it with L1/L2
    regularization, monitoring, best-model checkpointing and early
    stopping.

    Parameters
    ----------
    label : str
        Experiment label used for the checkpoint filename and the Bokeh
        plot title.
    config : dict
        Hyper-parameter dictionary; see the unpacking below for the
        expected keys.
    """
    ############## CREATE THE NETWORK ###############
    #Define the parameters
    num_epochs, num_batches, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation = config[
        'num_epochs'], config['num_batches'], config['num_channels'], config[
            'image_shape'], config['filter_size'], config[
                'num_filter'], config['pooling_sizes'], config[
                    'mlp_hiddens'], config['output_size'], config[
                        'batch_size'], config['activation'], config[
                            'mlp_activation']
    # print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")
    #Create the symbolic variables
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    #Get the parameters
    conv_parameters = zip(filter_size, num_filter)

    #Create the convolution layers: interleave conv / activation / pooling.
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size, num_filter)
                     in enumerate(conv_parameters)), (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    # (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))

    #Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    #Initialize the convnet
    conv_sequence.initialize()
    #Add the MLP: flattened conv output -> hidden layers -> output layer.
    top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))
                    ] + mlp_hiddens + [output_size]
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    #Initialize the MLP
    mlp.initialize()
    #Get the output
    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)
    #Little trick to plot the error rate in two different plots (we can't
    #use the same data twice in the plot, for an unknown reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    l2_penalty = T.sum([
        lambda_l2 * (W**2).sum() for i, W in enumerate(weights + biases)
    ])  # Gradually increase penalty for layer
    # # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases])
    # # #l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases])
    # l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    # l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config.keys()
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    # Checkpoint only when the validation error hits a new best.
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord('valid_error_rate_best_so_far'))
    #Adding a live plot with the bokeh server
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            # ['train_costreg','train_grad_norm']], #
            [
                'train_costreg', 'train_total_gradient_norm',
                'train_l2_penalty', 'train_l1_penalty'
            ]
        ],
        server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring([
            costreg, error_rate, error_rate2, grad_norm, l2_penalty,
            l1_penalty
        ],
                               prefix="train",
                               after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        TrackTheBest('valid_error_rate', min),  #Keep best
        checkpoint,  #Save best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)
    ]  # Early-stopping
    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
def setUp(self):
    """Build and initialize the SimpleRecurrent fixture used by the tests."""
    recurrent = SimpleRecurrent(dim=3,
                                weights_init=Constant(2),
                                activation=Tanh())
    recurrent.initialize()
    self.simple = recurrent
name="lstm") else: lstm = LSTM(activation=Tanh(), dim=h_dim, bias=bias, name="lstm") h, c = lstm.apply(x_transform) h_to_o = Linear(name='h_to_o', input_dim=h_dim, output_dim=o_dim) o = h_to_o.apply(h) o = NDimensionalSoftmax().apply(o, extra_ndim=1) for brick in (lstm, x_to_h, h_to_o): brick.weights_init = Glorot() brick.biases_init = Constant(0) brick.initialize() cost = CategoricalCrossEntropy().apply(y, o) cost.name = 'CE' print 'Bulding training process...' shapes = [] for param in ComputationGraph(cost).parameters: # shapes.append((param.name, param.eval().shape)) shapes.append(np.prod(list(param.eval().shape))) print "Total number of parameters: " + str(np.sum(shapes)) if not os.path.exists(save_path): os.makedirs(save_path) log_path = save_path + '/log.txt'
def create_model(self):
    """Build the symbolic cost graph for the LSTM sequence classifier.

    Pipeline: token indices -> embedding lookup -> linear projection to
    the four LSTM gate pre-activations -> LSTM -> logistic MLP on the
    hidden state of each sequence's last valid timestep.

    Returns
    -------
    cost : Theano scalar-per-example expression named "cost"
        A class-prior-weighted reward used as the training objective.

    Side effects: stores the constructed bricks on ``self`` (``lookup``,
    ``x_to_h``, ``lstm``, ``h_to_o``).
    """
    # Pull the symbolic inputs/hyperparameters set up elsewhere on self.
    # NOTE(review): x appears to be an int matrix of token indices and
    # mask a time-major binary mask — confirm against the data pipeline.
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim

    # Bricks: embedding table, input-to-gates projection (4 * hidden_dim
    # because the LSTM expects stacked input/forget/cell/output gate
    # pre-activations), the LSTM itself, and a logistic output MLP.
    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0), name='h_to_o')
    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()

    # Embed and reshape to (batch, time, embedding_dim), then transpose
    # to time-major (time, batch, embedding_dim) as the LSTM expects.
    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"

    # Keep the bricks around for inspection / parameter access.
    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"
    # only values of hidden units of the last timeframe are used for
    # the classification: mask sums give each sequence's length, so
    # indices picks the last valid timestep per batch column.
    indices = T.sum(mask, axis=0) - 1
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)

    # NOTE(review): probs is 1 - sigmoid(out), i.e. the probability of
    # the *negative* class under this parameterization — verify that the
    # label convention below matches.
    probs = 1 - out
    probs.name = "probability"
    y = y.dimshuffle(0, 'x')
    # Create the if-else cost function: each class's term is reweighted
    # by its prior p (positive) / 1 - p (negative).
    pos_ex = (y * probs) / p
    neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
    reward = pos_ex + neg_ex
    cost = reward  # Negative of reward
    cost.name = "cost"
    return cost
def create_layers(layer_spec, data_dim, deterministic_layers=0,
                  deterministic_act=None, deterministic_size=1.):
    """Construct matching stacks of P- and Q-model Bernoulli layers.

    Parameters
    ----------
    layer_spec : str
        Either a comma-separated list of layer sizes ("100,50,25,10"),
        or a geometric spec of the form "<f>x-<n>l-<k>": n layers whose
        sizes interpolate geometrically from f * data_dim down to k.
    data_dim : int
        Dimensionality of the training/test data. The bottom-most layers
        work with this dimension.
    deterministic_layers : int
        Number of extra deterministic MLP layers inserted inside each
        stochastic layer's MLP.
    deterministic_act : brick class or None
        Activation brick instantiated for each deterministic layer
        (must be non-None when deterministic_layers > 0).
    deterministic_size : float
        Width factor for the deterministic layers, relative to the mean
        of the adjacent stochastic layer sizes.

    Returns
    -------
    p_layers : list
        List of BernoulliLayers with a BernoulliTopLayer on top.
    q_layers : list
        List of BernoulliLayers (one per generative layer, no top layer).
    """
    inits = {
        'weights_init': RWSInitialization(factor=1.),
        # 'weights_init': IsotropicGaussian(0.1),
        'biases_init': Constant(-1.0),
    }

    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    m = re.match(r"(\d*\.?\d*)x-(\d+)l-(\d+)", layer_spec)
    if m:
        # Geometric spec: interpolate n_layers sizes from
        # first = f * data_dim down to last.
        first = int(data_dim * float(m.group(1)))
        last = float(m.group(3))
        n_layers = int(m.group(2))
        base = numpy.exp(numpy.log(first / last) / (n_layers - 1))
        layer_sizes = [data_dim] + [
            int(last * base**i) for i in reversed(range(n_layers))
        ]
        print(layer_sizes)
    else:
        # Explicit comma-separated sizes.
        layer_sizes = [data_dim] + [int(i) for i in layer_spec.split(",")]

    p_layers = []
    q_layers = []

    for l, (size_lower, size_upper) in enumerate(zip(layer_sizes[:-1],
                                                     layer_sizes[1:])):
        # int(): deterministic_size is a float and float // 2 stays a
        # float, but MLP layer sizes must be integers.
        size_mid = int((deterministic_size * (size_upper + size_lower)) // 2)
        # P-model layer maps upper -> lower (generative direction);
        # Q-model layer maps lower -> upper (inference direction).
        p_layers.append(
            BernoulliLayer(MLP(
                [deterministic_act() for i in range(deterministic_layers)]
                + [Logistic()],
                [size_upper]
                + [size_mid for i in range((deterministic_layers))]
                + [size_lower],
                **inits),
                name="p_layer%d" % l))
        q_layers.append(
            BernoulliLayer(MLP(
                [deterministic_act() for i in range(deterministic_layers)]
                + [Logistic()],
                [size_lower]
                + [size_mid for i in range((deterministic_layers))]
                + [size_upper],
                **inits),
                name="q_layer%d" % l))

    # Top-most prior over the deepest latent layer.
    p_layers.append(
        BernoulliTopLayer(layer_sizes[-1], name="p_top_layer", **inits))

    return p_layers, q_layers
def testing_init(brick):
    """Give *brick* deterministic parameters (identity weights, zero
    biases) and initialize it, for use in tests."""
    brick.biases_init = Constant(0)
    brick.weights_init = Identity()
    brick.initialize()
class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_. Every
    unit of a GRNN is equipped with update and reset gates that facilitate
    better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None,
                 **kwargs):
        super(GatedRecurrent, self).__init__(**kwargs)
        self.dim = dim
        # Optional separate initializers for the recurrent weights and the
        # learned initial state; when left as None, _initialize falls back
        # to weights_init / Constant(0.0) respectively.
        self.recurrent_weights_init = None
        self.initial_states_init = None
        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation
        self.children = [activation, gate_activation]

    @property
    def state_to_state(self):
        # (dim, dim) recurrent weight matrix for the candidate state.
        return self.parameters[0]

    @property
    def state_to_gates(self):
        # (dim, 2*dim) recurrent weights for [update, reset] gates.
        return self.parameters[1]

    @property
    def initial_states_(self):
        # (dim,) learned initial hidden state.
        return self.parameters[2]

    def get_dim(self, name):
        """Return the dimensionality of the named input/state variable."""
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        if name == 'gate_inputs':
            # Update and reset gate pre-activations are stacked.
            return 2 * self.dim
        return super(GatedRecurrent, self).get_dim(name)

    def _allocate(self):
        # Allocate NaN-filled shared variables; real values are written by
        # _initialize. Order matters: the properties above index into
        # self.parameters positionally.
        self.parameters.append(shared_floatx_nans((self.dim, self.dim),
                               name='state_to_state'))
        add_role(self.parameters[-1], WEIGHT)
        self.parameters.append(shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        add_role(self.parameters[-1], WEIGHT)
        self.parameters.append(shared_floatx_nans((self.dim,),
                               name="initial_state"))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        #TODO: know what to do after Blocks #740 is resolved:
        if self.recurrent_weights_init is None:
            self.recurrent_weights_init = self.weights_init
        if self.initial_states_init is None:
            self.initial_states_init = Constant(0.0)
        self.recurrent_weights_init.initialize(self.state_to_state, self.rng)
        # The two gate weight matrices are sampled separately and then
        # concatenated into the single (dim, 2*dim) state_to_gates matrix.
        state_to_update = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))
        # NOTE(review): attribute access by parameter name on
        # self.parameters (vs. the positional initial_states_ property) —
        # confirm the installed Blocks version supports this.
        self.initial_states_init.initialize(self.parameters.initial_state,
                                            self.rng)

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape
            (batch_size, dim)
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the
            shape (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if
            there is data available, 0 if not. Assumed to be 1-s
            only if not given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        # Gates: sigmoid(states . W_gates + gate_inputs), then split the
        # 2*dim columns into update (first half) and reset (second half).
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        # Candidate state computed from the reset-gated previous state.
        states_reset = states * reset_values
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        # Convex combination of candidate and previous state via the
        # update gate.
        next_states = (next_states * update_values +
                       states * (1 - update_values))
        # Where the mask is 0 (padding), carry the previous state through.
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        # Broadcast the learned (dim,) initial state to (batch_size, dim).
        # NOTE(review): same parameters.initial_state attribute access as
        # in _initialize — confirm Blocks version.
        return [tensor.repeat(self.parameters.initial_state[None, :],
                              batch_size, 0)]