def test_imocld_mnist(step_type='add', attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 16

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    att_tag = "NA"  # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2*x_dim  # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)

    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim + mix_dim)], \
                      name="mix_dec_mlp", **inits)
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4*enc_dim], \
                     name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4*enc_dim], \
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4*dec_dim], \
                     name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # LSTMs for the variational, encoder, and decoder recurrences
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
                n_iter,
                step_type=step_type,  # step_type can be 'add' or 'jump'
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                mix_enc_mlp=mix_enc_mlp,
                mix_dec_mlp=mix_dec_mlp,
                mix_var_mlp=mix_var_mlp,
                enc_mlp_in=enc_mlp_in,
                enc_mlp_out=enc_mlp_out,
                enc_rnn=enc_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_rnn=dec_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    draw.initialize()

    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    # sample several interchangeable versions of the model
    conditions = [{'occ_dim': 0, 'drop_prob': 0.8},
                  {'occ_dim': 16, 'drop_prob': 0.0}]
    for cond_dict in conditions:
        occ_dim = cond_dict['occ_dim']
        drop_prob = cond_dict['drop_prob']
        dp_int = int(100.0 * drop_prob)

        draw.load_model_params(f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(
            occ_dim, dp_int, step_type, att_tag))

        # draw some independent samples from the model
        Xva = row_shuffle(Xva)
        Xb = to_fX(Xva[:128])
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                          occ_dim=occ_dim, data_mean=None)
        Xb = np.repeat(Xb, 2, axis=0)
        Mb = np.repeat(Mb, 2, axis=0)
        samples, _ = draw.do_sample(Xb, Mb)

        # save the samples to a pkl file, in their numpy array form
        sample_pkl_name = "IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}.pkl".format(
            occ_dim, dp_int, step_type)
        f_handle = open(sample_pkl_name, 'wb')
        cPickle.dump(samples, f_handle, protocol=-1)
        f_handle.close()
        print("Saved some samples in: {}".format(sample_pkl_name))
    return
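# Minimal usage sketch (an assumption, not part of the original module):
# invoke the test above when the file is run directly. 'add' and 'jump'
# are the two step types the function handles.
if __name__ == "__main__":
    test_imocld_mnist(step_type='add', attention=False)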
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
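# Hedged usage sketch, not from the original source: one plausible way to
# wire the returned cost and state updates into a Blocks training algorithm.
# `vocab_size` and `args` are assumed to be supplied by the caller, and the
# StepClipping threshold is an arbitrary illustrative value.
from blocks.algorithms import GradientDescent, StepClipping
from blocks.graph import ComputationGraph

def make_vanilla_algorithm(vocab_size, args):
    cost, cross_entropy, updates = build_model_vanilla(vocab_size, args)
    cg = ComputationGraph([cost])
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=StepClipping(1.0))
    # carry the last hidden states over to the next batch
    algorithm.add_updates(updates)
    return algorithm, cross_entropy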
def build_model_lstm(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [LSTM(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)
    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(mask=x_mask, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    hidden_states = []
    for d in range(args.layers):
        # TODO correct bug
        # h[5 * d] = h[5 * d] * x_mask
        # h[5 * d + 1] = h[5 * d + 1] * x_mask
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

        h[5 * d].name = "hidden_state_" + str(d)
        h[5 * d + 1].name = "hidden_cell_" + str(d)
        hidden_states.extend([h[5 * d], h[5 * d + 1]])

    # The updates of the hidden states
    # Note: if we have a mask, then updating the initial state
    # with the last state does not make sense anymore.
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))
        updates.append((inits[1][d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if args.layers > 1:
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state_all"

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, gate_values, hidden_states
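# Hedged sketch (an assumption, not original code): the gate_values returned
# above can be compiled into Theano functions to monitor mean gate
# activations, e.g. to check for saturation. `inputs` is whatever list of
# symbolic variables get_prernn(args) was built from.
import theano

def compile_gate_monitors(gate_values, inputs):
    monitors = {}
    for name, gates in gate_values.items():
        # mean activation of each gate, averaged over time, batch and units
        means = [g.mean() for g in gates]
        monitors[name] = theano.function(inputs, means)
    return monitors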
def setUp(self):
    self.simple = SimpleRecurrent(dim=3, weights_init=Constant(2),
                                  activation=Tanh())
    self.simple.initialize()
lookup_input = LookupTable(
    length=train_dataset.syllables_vocab_size() + 1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(name='linear_input',
                      input_dim=hidden_layer_dim,
                      output_dim=hidden_layer_dim,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
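# Hedged completion sketch: the snippet above never uses `softmax`; a usual
# next step is a cross-entropy cost against a target matrix. The target
# variable `y` and the theano import are assumptions, not original code.
from theano import tensor
y = tensor.lmatrix('durations')
cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()
cost.name = 'cost'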
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print kwargs

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)

    return f_pre_rnn, f_h
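# Hedged usage sketch with toy values (not from the original source): compile
# the two functions and push a small batch of integer ids through them.
# `args` is not used inside build_fork_lookup, so None is passed here.
import numpy
f_pre_rnn, f_h = build_fork_lookup(vocab_size=10, time_length=5, args=None)
toy_batch = numpy.random.randint(0, 10, size=(4, 5)).astype('int64')
embedded = f_pre_rnn(toy_batch)   # time-major embedded inputs
states = f_h(toy_batch)           # clockwork RNN hidden states
print embedded.shape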
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError( 'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)' ) z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream( IndexableDataset(indexables=OrderedDict([('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data', ) * 3 + stream.sources + ( 'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream, ) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 4, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d' % l, ogates_zoneout=ogates_zoneout) for l in range(num_layers) ] elif rnn_type.lower() == 'gru': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 3, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d' % l) for l in range(num_layers) ] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d' % l) for l in range(num_layers) ] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply( rnn_embedding, zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost / np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states' % l in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states' % l: norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x * x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [ cost_train, cost, bpc, perp, learning_rate, aggregation.mean( algorithm.total_gradient_norm).copy("gradient_norm_mean") ] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: 
print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append( RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value() / lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError( 'Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
def train():
    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size,
                                          name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout')

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')
        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)
            ])

    main.run()
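# Assumed entry point (not part of the original snippet): start or resume
# training when the file is executed directly.
if __name__ == '__main__':
    train()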
def __init__(self, vocab_size, topicWord_size, embedding_dim, state_dim, topical_dim, representation_dim, match_function='SumMacthFunction', use_doubly_stochastic=False, lambda_ds=0.001, use_local_attention=False, window_size=10, use_step_decay_cost=False, use_concentration_cost=False, lambda_ct=10, use_stablilizer=False, lambda_st=50, theano_seed=None, **kwargs): super(Decoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.topicWord_size = topicWord_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.representation_dim = representation_dim self.theano_seed = theano_seed # Initialize gru with special initial state self.transition = GRU(attended_dim=state_dim, dim=state_dim, activation=Tanh(), name='decoder') self.energy_computer = globals()[match_function](name='energy_comp') # Initialize the attention mechanism self.attention = SequenceContentAttention( state_names=self.transition.apply.states, attended_dim=representation_dim, match_dim=state_dim, energy_computer=self.energy_computer, use_local_attention=use_local_attention, window_size=window_size, name="attention") self.topical_attention = SequenceContentAttention( state_names=self.transition.apply.states, attended_dim=topical_dim, match_dim=state_dim, energy_computer=self.energy_computer, use_local_attention=use_local_attention, window_size=window_size, name="topical_attention" ) #not sure whether the match dim would be correct. # Initialize the readout, note that SoftmaxEmitter emits -1 for # initial outputs which is used by LookupFeedBackWMT15 readout = Readout(source_names=[ 'states', 'feedback', self.attention.take_glimpses.outputs[0] ], readout_dim=self.vocab_size, emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed), feedback_brick=LookupFeedbackWMT15( vocab_size, embedding_dim), post_merge=InitializableFeedforwardSequence([ Bias(dim=state_dim, name='maxout_bias').apply, Maxout(num_pieces=2, name='maxout').apply, Linear(input_dim=state_dim / 2, output_dim=embedding_dim, use_bias=False, name='softmax0').apply, Linear(input_dim=embedding_dim, name='softmax1').apply ]), merged_dim=state_dim, name='readout') # calculate the readout of topic word, # no specific feedback brick, use the trival feedback break # no post_merge and merge, use Bias and Linear topicWordReadout = Readout(source_names=[ 'states', 'feedback', self.attention.take_glimpses.outputs[0] ], readout_dim=self.topicWord_size, emitter=SoftmaxEmitter( initial_output=-1, theano_seed=theano_seed), name='twReadout') # Build sequence generator accordingly self.sequence_generator = SequenceGenerator( readout=readout, topicWordReadout=topicWordReadout, topic_vector_names=['topicSumVector'], transition=self.transition, attention=self.attention, topical_attention=self.topical_attention, q_dim=self.state_dim, #q_name='topic_embedding', topical_name='topic_embedding', content_name='content_embedding', use_step_decay_cost=use_step_decay_cost, use_doubly_stochastic=use_doubly_stochastic, lambda_ds=lambda_ds, use_concentration_cost=use_concentration_cost, lambda_ct=lambda_ct, use_stablilizer=use_stablilizer, lambda_st=lambda_st, fork=Fork([ name for name in self.transition.apply.sequences if name != 'mask' ], prototype=Linear())) self.children = [self.sequence_generator]
def main(mode, save_path, num_batches, from_dump): if mode == "train": # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Data processing pipeline data_stream = DataStreamMapping( mapping=lambda data: tuple(array.T for array in data), data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=lambda data: len(data[0]) <= 100, data_stream=OneBillionWord( "training", [99], char2code, level="character", preprocess=str.lower).get_default_stream()))))) # Build the model chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") encoder = Bidirectional(GatedRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) encoder.initialize() fork = Fork([ name for name in encoder.prototype.apply.sequences if name != 'mask' ], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.fork_dims = {name: dimension for name in fork.fork_names} fork.initialize() lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) lookup.initialize() transition = Transition(activation=Tanh(), dim=dimension, attended_dim=2 * dimension, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback( readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() bricks = [encoder, fork, lookup, generator] # Give an idea of what's going on params = Selector(bricks).get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Build the cost computation graph batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) energies = unpack(VariableFilter(application=readout.readout, name="output")(cg.variables), singleton=True) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy(activations.mean(), "mean_activation") # Define the training algorithm. 
algorithm = GradientDescent(cost=cost, step_rule=CompositeRule([ GradientClipping(10.0), SteepestDescent(0.01) ])) observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) main_loop = MainLoop( model=bricks, data_stream=data_stream, algorithm=algorithm, extensions=([LoadFromDump(from_dump)] if from_dump else []) + [ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), TrainingDataMonitoring( observables, prefix="average", every_n_batches=10), FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", lambda log: math.isnan( log.current_row.total_gradient_norm)), Plot(os.path.basename(save_path), [["average_" + cost.name], ["average_" + cost_per_character.name]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": with open(save_path, "rb") as source: encoder, fork, lookup, generator = dill.load(source) logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) sample_function = ComputationGraph(generated).get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=lambda tuple_: -tuple_[0]) for _, message in messages: print(message)
def __init__(self, emb_dim, dim, num_input_words, num_output_words, vocab, proximity_coef=0, proximity_distance='l2', encoder='lstm', decoder='lstm', shared_rnn=False, translate_layer=None, word_dropout=0., tied_in_out=False, vocab_keys=None, seed=0, reconstruction_coef=1., provide_targets=False, **kwargs): """ translate_layer: either a string containing the activation function to use either a list containg the list of activations for a MLP """ if emb_dim == 0: emb_dim = dim if num_input_words == 0: num_input_words = vocab.size() if num_output_words == 0: num_output_words = vocab.size() self._word_dropout = word_dropout self._tied_in_out = tied_in_out if not encoder: if proximity_coef: raise ValueError("Err: meaningless penalty term (no encoder)") if not vocab_keys: raise ValueError("Err: specify a key vocabulary (no encoder)") if tied_in_out and num_input_words != num_output_words: raise ValueError("Can't tie in and out embeddings. Different " "vocabulary size") if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'): raise ValueError( "can't share RNN because either encoder or decoder" "is not an RNN") if shared_rnn and decoder == 'lstm_c': raise ValueError( "can't share RNN because the decoder takes different" "inputs") if word_dropout < 0 or word_dropout > 1: raise ValueError("invalid value for word dropout", str(word_dropout)) if proximity_distance not in ['l1', 'l2', 'cos']: raise ValueError( "unrecognized distance: {}".format(proximity_distance)) if proximity_coef and emb_dim != dim and not translate_layer: raise ValueError( """if proximity penalisation, emb_dim should equal dim or there should be a translate layer""") if encoder not in [ None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm', 'bilstm_sum', 'max_bilstm_sum' ]: raise ValueError('encoder not recognized') if decoder not in ['skip-gram', 'lstm', 'lstm_c']: raise ValueError('decoder not recognized') self._proximity_distance = proximity_distance self._decoder = decoder self._encoder = encoder self._num_input_words = num_input_words self._num_output_words = num_output_words self._vocab = vocab self._proximity_coef = proximity_coef self._reconstruction_coef = reconstruction_coef self._provide_targets = provide_targets self._word_to_id = WordToIdOp(self._vocab) if vocab_keys: self._key_to_id = WordToIdOp(vocab_keys) children = [] if encoder or (not encoder and decoder in ['lstm', 'lstm_c']): self._main_lookup = LookupTable(self._num_input_words, emb_dim, name='main_lookup') children.append(self._main_lookup) if provide_targets: # this is useful to simulate Hill's baseline without pretrained embeddings # in the encoder, only as targets for the encoder. 
self._target_lookup = LookupTable(self._num_input_words, emb_dim, name='target_lookup') children.append(self._target_lookup) if not encoder: self._key_lookup = LookupTable(vocab_keys.size(), emb_dim, name='key_lookup') children.append(self._key_lookup) elif encoder == 'lstm': self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork') self._encoder_rnn = LSTM(dim, name='encoder_rnn') children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder in ['bilstm', 'max_bilstm']: # dim is the dim of the concatenated vector self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork') self._encoder_rnn = Bidirectional(LSTM(dim / 2, name='encoder_rnn')) children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder in ['bilstm_sum', 'max_bilstm_sum']: self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork') self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn')) children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder == 'mean': pass elif encoder == 'weighted_mean': self._encoder_w = MLP([Logistic()], [dim, 1], name="encoder_weights") children.extend([self._encoder_w]) else: raise NotImplementedError() if decoder in ['lstm', 'lstm_c']: dim_after_translate = emb_dim if shared_rnn: self._decoder_fork = self._encoder_fork self._decoder_rnn = self._encoder_rnn else: if decoder == 'lstm_c': dim_2 = dim + emb_dim else: dim_2 = dim self._decoder_fork = Linear(dim_2, 4 * dim, name='decoder_fork') self._decoder_rnn = LSTM(dim, name='decoder_rnn') children.extend([self._decoder_fork, self._decoder_rnn]) elif decoder == 'skip-gram': dim_after_translate = emb_dim self._translate_layer = None activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None} if translate_layer: if type(translate_layer) == str: translate_layer = [translate_layer] assert (type(translate_layer) == list) activations_translate = [activations[a] for a in translate_layer] dims_translate = [ dim, ] * len(translate_layer) + [dim_after_translate] self._translate_layer = MLP(activations_translate, dims_translate, name="translate_layer") children.append(self._translate_layer) if not self._tied_in_out: self._pre_softmax = Linear(emb_dim, self._num_output_words) children.append(self._pre_softmax) if decoder in ['lstm', 'lstm_c']: self._softmax = NDimensionalSoftmax() elif decoder in ['skip-gram']: self._softmax = Softmax() children.append(self._softmax) super(Seq2Seq, self).__init__(children=children, **kwargs)
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim, attention_strategy='content', attention_sources='s', readout_sources='sfa', memory='none', memory_size=500, seq_len=50, init_strategy='last', theano_seed=None, **kwargs): """Creates a new decoder brick without embedding. Args: vocab_size (int): Target language vocabulary size embedding_dim (int): Size of feedback embedding layer state_dim (int): Number of hidden units representation_dim (int): Dimension of source annotations attention_strategy (string): Which attention should be used cf. ``_initialize_attention`` attention_sources (string): Defines the sources used by the attention model 's' for decoder states, 'f' for feedback readout_sources (string): Defines the sources used in the readout network. 's' for decoder states, 'f' for feedback, 'a' for attention (context vector) memory (string): Which external memory should be used (cf. ``_initialize_attention``) memory_size (int): Size of the external memory structure seq_len (int): Maximum sentence length init_strategy (string): How to initialize the RNN state (cf. ``GRUInitialState``) theano_seed: Random seed """ super(NoLookupDecoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.representation_dim = representation_dim self.theano_seed = theano_seed # Initialize gru with special initial state self.transition = GRUInitialState(attended_dim=state_dim, init_strategy=init_strategy, dim=state_dim, activation=Tanh(), name='decoder') # Initialize the attention mechanism self.attention, src_names = _initialize_attention( attention_strategy, seq_len, self.transition, representation_dim, state_dim, attention_sources, readout_sources, memory, memory_size) # Initialize the readout, note that SoftmaxEmitter emits -1 for # initial outputs which is used by LookupFeedBackWMT15 readout = Readout( source_names=src_names, readout_dim=embedding_dim, emitter=NoLookupEmitter(initial_output=-1, readout_dim=embedding_dim, cost_brick=SquaredError()), # cost_brick=CategoricalCrossEntropy()), feedback_brick=TrivialFeedback(output_dim=embedding_dim), post_merge=InitializableFeedforwardSequence([ Bias(dim=state_dim, name='maxout_bias').apply, Maxout(num_pieces=2, name='maxout').apply, Linear(input_dim=state_dim / 2, output_dim=embedding_dim, use_bias=False, name='softmax0').apply, Logistic(name='softmax1').apply ]), merged_dim=state_dim) # Build sequence generator accordingly self.sequence_generator = SequenceGenerator( readout=readout, transition=self.transition, attention=self.attention, fork=Fork([ name for name in self.transition.apply.sequences if name != 'mask' ], prototype=Linear())) self.children = [self.sequence_generator]
from dataset import Corpus, createDataset

args = getArguments()
corpus = Corpus(open(args.corpus).read())
train_data, vocab_size = createDataset(corpus=corpus,
                                       sequence_length=750,
                                       repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=["states"],  # transition.apply.states ???
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(vocab_size,
                                              feedback_dim,
                                              name='feedback'),
                name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
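    # Hedged continuation sketch (an assumption, not original code): after the
    # initialization config is pushed, the generator is initialized and a cost
    # can be built on a batch of character ids; `tensor` is assumed imported.
    generator.initialize()
    chars = tensor.lmatrix("features")
    cost = generator.cost(chars)
    cost.name = "cost"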
def main(): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser( "Case study of generating a Markov chain with RNN.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "mode", choices=["train", "sample"], help="The mode to run. Use `train` to train a new model" " and `sample` to sample a sequence generated by an" " existing one.") parser.add_argument("prefix", default="sine", help="The prefix for model, timing and state files") parser.add_argument("--steps", type=int, default=100, help="Number of steps to plot") args = parser.parse_args() dim = 10 num_states = ChainIterator.num_states feedback_dim = 8 transition = GatedRecurrent(name="transition", activation=Tanh(), dim=dim) generator = SequenceGenerator(LinearReadout( readout_dim=num_states, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback(num_states, feedback_dim, name='feedback'), name="readout"), transition, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") generator.allocate() logger.debug("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_params().items()], width=120)) if args.mode == "train": rng = numpy.random.RandomState(1) batch_size = 50 generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() logger.debug("transition.weights_init={}".format( transition.weights_init)) cost = generator.cost(tensor.lmatrix('x')).sum() gh_model = GroundhogModel(generator, cost) state = GroundhogState(args.prefix, batch_size, learning_rate=0.0001).as_dict() data = ChainIterator(rng, 100, batch_size) trainer = SGD(gh_model, state, data) main_loop = MainLoop(data, None, None, gh_model, trainer, state, None) main_loop.main() elif args.mode == "sample": load_params(generator, args.prefix + "model.npz") sample = ComputationGraph( generator.generate(n_steps=args.steps, batch_size=1, iterate=True)).function() states, outputs, costs = [data[:, 0] for data in sample()] numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() print("Frequencies:\n {} vs {}".format(freqs, ChainIterator.equilibrium)) trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] print("Transition frequencies:\n{}\nvs\n{}".format( trans_freqs, ChainIterator.trans_prob)) else: assert False
def test_integer_sequence_generator(): """Test a sequence generator with integer outputs. Such sequence generators can be used to e.g. model language. """ rng = numpy.random.RandomState(1234) readout_dim = 5 feedback_dim = 3 dim = 20 batch_size = 30 n_steps = 10 transition = GatedRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) generator = SequenceGenerator(Readout( readout_dim=readout_dim, source_names=["states"], emitter=SoftmaxEmitter(theano_seed=1234), feedback_brick=LookupFeedback(readout_dim, feedback_dim)), transition, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.lmatrix('y') mask = tensor.matrix('mask') costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 costs_fun = theano.function([y, mask], [costs]) y_test = rng.randint(readout_dim, size=(n_steps, batch_size)) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = costs_fun(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 482.827, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], [cost])(y_test, m_test) assert_allclose(cost_val, 16.0942, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join( [generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [ el for el in var_filter(cg.variables) if el.name == aux_var_name ][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5) # Test generate states, outputs, costs = generator.generate(iterate=True, batch_size=batch_size, n_steps=n_steps) cg = ComputationGraph(states + outputs + costs) states_val, outputs_val, costs_val = theano.function( [], [states, outputs, costs], updates=cg.updates)() assert states_val.shape == (n_steps, batch_size, dim) assert outputs_val.shape == (n_steps, batch_size) assert outputs_val.dtype == 'int64' assert costs_val.shape == (n_steps, batch_size) assert_allclose(states_val.sum(), -17.91811, rtol=1e-5) assert_allclose(costs_val.sum(), 482.863, rtol=1e-5) assert outputs_val.sum() == 630 # Test masks agnostic results of cost cost1 = costs_fun([[1], [2]], [[1], [1]])[0] cost2 = costs_fun([[3, 1], [4, 2], [2, 0]], [[1, 1], [1, 1], [1, 0]])[0] assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
from blocks.algorithms import (Momentum, AdaDelta, RMSProp, CompositeRule,
                               BasicMomentum, RemoveNotFinite, StepClipping)
from blocks.bricks import Tanh, Softmax, Linear, MLP
from blocks.bricks.recurrent import LSTM
from blocks.bricks.lookup import LookupTable
from blocks.initialization import IsotropicGaussian, Constant
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph, apply_noise, apply_dropout

name = 'RNN'

couches = 1  # number of layers

input_dim = 1
out_dim = 1
hidden_dim = 64
activation_function = Tanh()
activation_function_name = 'Tanh'

batch_size = 100

w_noise_std = 0.01
i_dropout = 0.5

proportion_train = 0.9

algo = 'RMS'
learning_rate_value = 1e-5
momentum_value = 0.9
decay_rate_value = 0

StepClipping_value = 2

step_rule = CompositeRule([RMSProp(learning_rate=learning_rate_value),
                           # decay_rate=decay_rate_value,
                           BasicMomentum(momentum=momentum_value),
                           StepClipping(StepClipping_value)])

print_freq = 1000
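# Hedged sketch (assumed names, not original code): the noise and dropout
# constants above are meant to be applied to a training graph with the graph
# utilities imported here; `cg` and `dropout_targets` are placeholders the
# rest of the script would provide.
def regularize_graph(cg, dropout_targets):
    # add Gaussian noise to all weight parameters, then dropout on the
    # selected variables
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    cg = apply_noise(cg, weights, w_noise_std)
    cg = apply_dropout(cg, dropout_targets, i_dropout)
    return cg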
def test_sequence_generator(): """Test a sequence generator with no contexts and continuous outputs. Such sequence generators can be used to model e.g. dynamical systems. """ rng = numpy.random.RandomState(1234) output_dim = 1 dim = 20 batch_size = 30 n_steps = 10 transition = SimpleRecurrent(activation=Tanh(), dim=dim, weights_init=Orthogonal()) generator = SequenceGenerator(Readout(readout_dim=output_dim, source_names=["states"], emitter=TestEmitter()), transition, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.tensor3('y') mask = tensor.matrix('mask') costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 y_test = rng.uniform(size=(n_steps, batch_size, output_dim)).astype(floatX) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 115.593, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], [cost])(y_test, m_test) assert_allclose(cost_val, 3.8531, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join( [generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [ el for el in var_filter(cg.variables) if el.name == aux_var_name ][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5) # Test 'generate' method states, outputs, costs = [ variable.eval() for variable in generator.generate(states=rng.uniform( size=(batch_size, dim)).astype(floatX), iterate=True, batch_size=batch_size, n_steps=n_steps) ] assert states.shape == (n_steps, batch_size, dim) assert outputs.shape == (n_steps, batch_size, output_dim) assert costs.shape == (n_steps, batch_size) assert_allclose(outputs.sum(), -0.33683, rtol=1e-5) assert_allclose(states.sum(), 15.7909, rtol=1e-5) # There is no generation cost in this case, since generation is # deterministic assert_allclose(costs.sum(), 0.0)
def test_mlp_use_bias_not_pushed_when_not_explicitly_specified():
    mlp = MLP(activations=[Tanh(), Tanh(), None],
              dims=[4, 5, 6, 7],
              prototype=Linear(use_bias=False))
    mlp.push_allocation_config()
    assert all(not lin.use_bias for lin in mlp.linear_transformations)
shuffle_questions = True
shuffle_entities = True

concat_ctx_and_question = False
concat_question_before = False

embed_size = 200

ctx_lstm_size = [256, 256]
ctx_skip_connections = False

question_lstm_size = [256]
question_skip_connections = True

attention_mlp_hidden = [200]
attention_mlp_activations = [Tanh()]

step_rule = CompositeRule([RMSProp(decay_rate=0.95, learning_rate=5e-5),
                           BasicMomentum(momentum=0.9)])

dropout = 0.2
w_noise = 0.

valid_freq = 10000
save_freq = 10000
print_freq = 1000

weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)

transition_weights_init = Orthogonal()
r_noise_std = 0.01
w_noise_std = 0.00

r_dropout = 0.0
x_dropout = 0.0
s_dropout = 0.0
i_dropout = 0.0
a_dropout = 0.0

s_l1pen = 0.02
i_l1pen = 0.00
a_l1pen = 0.000

ae_dims = [100, 100]
ae_f_noise_std = 0.02
ae_l1_pen = 0.01
ae_activations = [Tanh() for _ in ae_dims]

center_feats = True
normalize_feats = True
randomize_feats = False

train_on_valid = False

hidden_dims = []
activation_functions = [Tanh() for _ in hidden_dims] + [None]
hidden_dims_2 = []
activation_functions_2 = [Tanh() for _ in hidden_dims_2]

n_inter = 10
inter_bias = None  # -5
inter_act_fun = Tanh()
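# Hedged sketch (assumed helper, not original code): one plausible way the
# ae_* constants above could be consumed to build a small autoencoder out of
# Blocks MLP bricks; `feat_dim` is the input feature dimension and the
# encoder/decoder split is an assumption.
def build_autoencoder(feat_dim):
    encoder = MLP(activations=ae_activations,
                  dims=[feat_dim] + ae_dims, name='ae_encoder')
    decoder = MLP(activations=ae_activations[:-1] + [None],
                  dims=ae_dims[::-1] + [feat_dim], name='ae_decoder')
    return encoder, decoder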
def __init__(self, **kwargs):
    super(ShallowEnergyComputer, self).__init__(
        [Tanh().apply, Linear(use_bias=False).apply], **kwargs)
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') better = tensor.imatrix('better') better_mask = tensor.imatrix('better_mask') worse = tensor.imatrix('worse') worse_mask = tensor.imatrix('worse_mask') b_left = tensor.imatrix('b_left') b_left_mask = tensor.imatrix('b_left_mask') b_right = tensor.imatrix('b_right') b_right_mask = tensor.imatrix('b_right_mask') w_left = tensor.imatrix('w_left') w_left_mask = tensor.imatrix('w_left_mask') w_right = tensor.imatrix('w_right') w_right_mask = tensor.imatrix('w_right_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) better = better.dimshuffle(1, 0) better_mask = better_mask.dimshuffle(1, 0) worse = worse.dimshuffle(1, 0) worse_mask = worse_mask.dimshuffle(1, 0) b_left = b_left.dimshuffle(1, 0) b_left_mask = b_left_mask.dimshuffle(1, 0) b_right = b_right.dimshuffle(1, 0) b_right_mask = b_right_mask.dimshuffle(1, 0) w_left = w_left.dimshuffle(1, 0) w_left_mask = w_left_mask.dimshuffle(1, 0) w_right = w_right.dimshuffle(1, 0) w_right_mask = w_right_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # candidate encoders candidates_hidden_list = [] candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0') candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0') candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0') candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins] #computing better encoding better_embed = embed.apply(better) better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed) better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed) better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX)) better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1]) better_hidden_list = [better_fwd_hidden, better_bwd_hidden] better_enc_dim = 2*sum(config.ctx_lstm_size) better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_enc.name = 'better_enc' candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden] #computing worse encoding worse_embed = embed.apply(worse) 
    worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed)
    worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed)
    worse_fwd_hidden, _ = candidate_fwd_lstm.apply(
        worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX))
    worse_bwd_hidden, _ = candidate_bwd_lstm.apply(
        worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1])
    worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden]
    worse_enc_dim = 2 * sum(config.ctx_lstm_size)
    worse_enc = tensor.concatenate([h[-1, :, :] for h in worse_hidden_list], axis=1)
    worse_enc.name = 'worse_enc'
    candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden]

    # Left-context encoders
    left_context_hidden_list = []
    left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size,
                                       output_dim=4 * config.ctx_lstm_size[0],
                                       name='left_context_fwd_lstm_in_0_0')
    left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                                 name='left_context_fwd_lstm_0')
    left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size,
                                       output_dim=4 * config.ctx_lstm_size[0],
                                       name='left_context_bwd_lstm_in_0_0')
    left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                                 name='left_context_bwd_lstm_0')

    # Add encoding bricks for initialization
    bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm,
                       left_context_fwd_lstm_ins, left_context_bwd_lstm_ins]

    # Right-context encoders
    right_context_hidden_list = []
    right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size,
                                        output_dim=4 * config.ctx_lstm_size[0],
                                        name='right_context_fwd_lstm_in_0_0')
    right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                                  name='right_context_fwd_lstm_0')
    right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size,
                                        output_dim=4 * config.ctx_lstm_size[0],
                                        name='right_context_bwd_lstm_in_0_0')
    right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                                  name='right_context_bwd_lstm_0')

    # Add encoding bricks for initialization
    bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm,
                       right_context_fwd_lstm_ins, right_context_bwd_lstm_ins]

    # Encodings of the left halves
    better_left_embed = embed.apply(b_left)
    better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed)
    better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed)
    better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(
        better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX))
    better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(
        better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1])
    better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden]
    better_left_enc_dim = 2 * sum(config.ctx_lstm_size)
    # Concatenate the last states of the fwd and bwd LSTMs: (2 * dim) x batch_size
    better_left_enc = tensor.concatenate([h[-1, :, :] for h in better_left_hidden_list], axis=1)
    better_left_enc.name = 'better_left_enc'
    left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden]

    worse_left_embed = embed.apply(w_left)
    worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed)
    worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed)
    worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(
        worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX))
    worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(
        worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1])
    worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden]
    worse_left_enc_dim = 2 * sum(config.ctx_lstm_size)
    # Concatenate the last states of the fwd and bwd LSTMs: (2 * dim) x batch_size
    worse_left_enc = tensor.concatenate([h[-1, :, :] for h in worse_left_hidden_list], axis=1)
    worse_left_enc.name = 'worse_left_enc'
    left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden]

    # Encodings of the right halves
    better_right_embed = embed.apply(b_right)
    better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed)
    better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed)
    better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(
        better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX))
    better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(
        better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1])
    better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden]
    better_right_enc_dim = 2 * sum(config.ctx_lstm_size)
    better_right_enc = tensor.concatenate([h[-1, :, :] for h in better_right_hidden_list], axis=1)
    better_right_enc.name = 'better_right_enc'
    right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden]

    worse_right_embed = embed.apply(w_right)
    worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed)
    worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed)
    worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(
        worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX))
    worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(
        worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1])
    worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden]
    worse_right_enc_dim = 2 * sum(config.ctx_lstm_size)
    worse_right_enc = tensor.concatenate([h[-1, :, :] for h in worse_right_hidden_list], axis=1)
    worse_right_enc.name = 'worse_right_enc'
    right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden]

    # F1 prediction MLP
    prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1],
                         activations=config.prediction_mlp_activations[1:] + [Identity()],
                         name='prediction_mlp')
    # Each of the four projections below maps to a quarter of the first MLP
    # layer, so its output_dim must be an integer.
    prediction_qlinear = Linear(input_dim=qenc_dim,
                                output_dim=config.prediction_mlp_hidden[0] // 4,
                                name='preq')
    prediction_cand_linear = Linear(input_dim=worse_enc_dim,
                                    output_dim=config.prediction_mlp_hidden[0] // 4,
                                    use_bias=False, name='precand')
    prediction_left_half_linear = Linear(input_dim=better_left_enc_dim,
                                         output_dim=config.prediction_mlp_hidden[0] // 4,
                                         use_bias=False, name='preleft')
    prediction_right_half_linear = Linear(input_dim=better_right_enc_dim,
                                          output_dim=config.prediction_mlp_hidden[0] // 4,
                                          use_bias=False, name='preright')
    bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear,
               prediction_left_half_linear, prediction_right_half_linear]

    better_layer1 = Tanh('tan1').apply(tensor.concatenate(
        [prediction_cand_linear.apply(better_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(better_left_enc),
         prediction_right_half_linear.apply(better_right_enc)], axis=1))
    better_layer1.name = 'better_layer1'
    worse_layer1 = Tanh('tan2').apply(tensor.concatenate(
        [prediction_cand_linear.apply(worse_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(worse_left_enc),
         prediction_right_half_linear.apply(worse_right_enc)], axis=1))
    worse_layer1.name = 'worse_layer1'

    better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1))  # batch_size
    worse_pred_weights = Tanh('rec2').apply(prediction_mlp.apply(worse_layer1))  # batch_size

    # numpy.set_printoptions(edgeitems=500)
    # better_pred_weights = theano.printing.Print('better')(better_pred_weights)
    # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights)

    # Cost: max(0, -score_better + score_worse + margin)
    margin = config.margin
    conditions = tensor.lt(better_pred_weights,
                           worse_pred_weights + margin).astype(theano.config.floatX)
    self.predictions = conditions
    cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
    cost = cost.mean()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
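# Illustrative only (not part of the model above): the ranking cost used there
# is a margin hinge, max(0, -s_better + s_worse + margin), averaged over the
# batch. A minimal NumPy sketch of that computation, assuming 1-D score arrays;
# the function name and values are hypothetical.
import numpy as np

def margin_ranking_loss(s_better, s_worse, margin=0.1):
    # The hinge is active only where the better score fails to beat the
    # worse score by at least the margin.
    active = (s_better < s_worse + margin).astype('float32')
    return ((-s_better + s_worse + margin) * active).mean()

# Example: the second pair violates the margin and contributes 0.2 to the sum,
# so the mean over two pairs is 0.1.
print(margin_ranking_loss(np.array([1.0, 0.2]), np.array([0.0, 0.3])))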
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             theano_seed=None, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    # Initialize the readout; note that SoftmaxEmitter emits -1 for
    # initial outputs, which is used by LookupFeedbackWMT15
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))

    self.children = [self.sequence_generator]
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(SoftGatedRecurrent(dim=state_dim,
                                              mlp=mlp,
                                              activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]
    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
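# Illustrative only: one way the values returned by build_model_soft might be
# consumed. `args` is a hypothetical namespace carrying the fields the function
# reads (context, state_dim, layers, skip_connections, mini_batch_size,
# mlp_layers, mlp_activation); vocab_size=50 and all numbers are arbitrary.
# The carried-over hidden-state `updates` are passed to theano.function so the
# recurrent states persist across mini-batches.
from argparse import Namespace

import theano
from blocks.graph import ComputationGraph

args = Namespace(context=1, state_dim=100, layers=2, skip_connections=True,
                 mini_batch_size=32, mlp_layers=1, mlp_activation="logistic")
cost, cross_entropy, updates, gate_values = build_model_soft(vocab_size=50,
                                                             args=args)
cg = ComputationGraph(cost)
monitor = theano.function(cg.inputs, [cost, cross_entropy] + gate_values,
                          updates=updates)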
conv_eeg = Convolutional(filter_size=(300, 1), num_filters=20, num_channels=1,
                         border_mode='full', tied_biases=True, name="conv_eeg")
maxpool_eeg = MaxPooling(pooling_size=(5, 1), name='maxpool_eeg')
# convolve
eeg1 = conv_eeg.apply(eeg)
# cut borders
d1 = (eeg1.shape[2] - eeg.shape[2]) / 2
eeg1 = eeg1[:, :, d1:d1 + eeg.shape[2], :]
# subsample
eeg1 = maxpool_eeg.apply(eeg1)
# activation
eeg1 = Tanh(name='act_eeg').apply(eeg1)

# second convolution only on eeg
conv_eeg2 = Convolutional(filter_size=(100, 1), num_filters=40, num_channels=20,
                          border_mode='full', tied_biases=True, name="conv_eeg2")
maxpool_eeg2 = MaxPooling(pooling_size=(5, 1), name='maxpool_eeg2')
# convolve
eeg2 = conv_eeg2.apply(eeg1)
# cut borders
d1 = (eeg2.shape[2] - eeg1.shape[2]) / 2
eeg2 = eeg2[:, :, d1:d1 + eeg1.shape[2], :]
# subsample
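# Illustrative only: with border_mode='full' a 1-D convolution output is longer
# than its input by (filter_length - 1), so the snippet above crops
# d1 = (out_len - in_len) / 2 samples from each end to recover an input-sized
# ("same") output. A small NumPy check of that index arithmetic; the lengths
# below are arbitrary.
import numpy as np

in_len, filter_len = 1000, 300
out_len = in_len + filter_len - 1          # length after a 'full' convolution
d1 = (out_len - in_len) // 2               # samples to drop at the start
cropped = np.arange(out_len)[d1:d1 + in_len]
assert cropped.shape[0] == in_len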
w_noise_std = 0.05
r_dropout = 0.0
s_dropout = 0.0
i_dropout = 0.0
a_dropout = 0.0

center_feats = True
normalize_feats = True
randomize_feats = False

train_on_valid = False

reconstruction_penalty = 1

hidden_dims_0 = [5]
activation_functions_0 = [Tanh() for _ in hidden_dims_0]
hidden_dims_1 = []
activation_functions_1 = [Tanh() for _ in hidden_dims_1] + [None]
hidden_dims_2 = []
activation_functions_2 = [Tanh() for _ in hidden_dims_2]

n_inter = 2
inter_act_fun = Tanh()

dataset = 'ARCENE'
pt_freq = 10

param_desc = '%s-%s%s,%d,%s-n%s-d%s,%s,%s,%s-p%s-%s-%s' % (
    dataset,
    repr(hidden_dims_0), repr(hidden_dims_1), n_inter, repr(hidden_dims_2),
    repr(w_noise_std),
    repr(r_dropout), repr(s_dropout), repr(i_dropout), repr(a_dropout),
    repr(reconstruction_penalty),
def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
             state_dim, **kwargs):
    """Sole constructor.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers. Layers share the same weight
                        matrices.
        skip_connections (bool): Skip connections connect the source word
                                 embeddings directly with deeper layers to
                                 propagate the gradient more efficiently
        state_dim (int): Number of hidden units in the recurrent layers.
    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections

    self.lookup = LookupTable(name='embeddings')
    if self.n_layers >= 1:
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork')
        self.children = [self.lookup, self.bidir,
                         self.fwd_fork, self.back_fork]
        if self.n_layers > 1:  # Deep encoder
            self.mid_fwd_fork = Fork(
                [name for name in self.bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='mid_fwd_fork')
            self.mid_back_fork = Fork(
                [name for name in self.bidir.prototype.apply.sequences
                 if name != 'mask'],
                prototype=Linear(), name='mid_back_fork')
            self.children.append(self.mid_fwd_fork)
            self.children.append(self.mid_back_fork)
    elif self.n_layers == 0:
        self.embedding_dim = state_dim * 2
        self.children = [self.lookup]
    else:
        logging.fatal("Number of encoder layers must be non-negative")
def main():
    x = T.tensor3('features')
    # m = T.matrix('features_mask')
    y = T.imatrix('targets')
    # x = x + m.mean() * 0
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    # embedding_size = 50
    # glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    # vaguely normalize
    x = x / 3.0 - .5

    # gloveMapping = Linear(
    #     input_dim=embedding_size,
    #     output_dim=128,
    #     weights_init=Orthogonal(),
    #     biases_init=Constant(0.0),
    #     name="gloveMapping")
    # gloveMapping.initialize()
    # o = gloveMapping.apply(x)
    # o = Rectifier(name="gloveRec").apply(o)
    o = x

    input_dim = 300
    gru = GatedRecurrentFull(
        hidden_dim=input_dim,
        activation=Tanh(),
        # activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=IsotropicGaussian(0.02),
        state_to_reset_init=IsotropicGaussian(0.02),
        state_to_update_init=IsotropicGaussian(0.02),
        input_to_state_transform=Linear(input_dim=input_dim, output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(input_dim=input_dim, output_dim=input_dim,
                                         weights_init=IsotropicGaussian(0.02),
                                         biases_init=Constant(0.0)),
        input_to_reset_transform=Linear(input_dim=input_dim, output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)))
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    # rnn_in = o
    # rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    # o = rnn_out[-1, :, :]
    o = rnn_out[-1]
    # o = rnn_out[:, -1, :]
    # o = rnn_out.mean(axis=1)

    # print rnn_last_out.eval({
    #     x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #     m: np.ones((3, 101), dtype=theano.config.floatX)})
    # raw_input()
    # o = rnn_out.mean(axis=1)

    score_layer = Linear(input_dim=300, output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.),
                         use_bias=True,
                         name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # print rnn_in.shape.eval(
    #     {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX)})
    # print rnn_out.shape.eval(
    #     {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #      m: np.ones((45, 111), dtype=theano.config.floatX)})
    # print (m).sum(axis=1).shape.eval({
    #     m: np.ones((45, 111), dtype=theano.config.floatX)})
    # print (m).shape.eval({
    #     m: np.ones((45, 111), dtype=theano.config.floatX)})
    # raw_input()

    # =================

    cg = ComputationGraph([cost])
    # cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            # NAG(lr=0.1, momentum=0.9),
            # AdaDelta(),
        ]))
    # algorithm.initialize()
    print params

    f = theano.function([x, y], algorithm.cost)
    ipdb.set_trace()
    print "making plots"
    # theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    bricks.append(embed)

    qembed = embed.apply(question)
    cembed = embed.apply(context)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + qlstms + clstms

    # Calculate question encoding (concatenate layer1)
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    if config.ctx_skip_connections:  # default: yes
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP (activations: Tanh, then identity)
    attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                        activations=config.attention_mlp_activations[1:] + [Identity()],
                        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               name='attq')  # W_um
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               use_bias=False, name='attc')  # W_ym
    bricks += [attention_mlp, attention_qlinear, attention_clinear]

    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
        ).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0]))
        + attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'

    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'
    # attended.shape is symbolic here, so it cannot be formatted with %d;
    # the debugging print is therefore disabled.
    # print("attended shape: %d" % attended.shape)

    dimension = qenc_dim + cenc_dim
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension, name="transition")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(vocab_size, dimension),
        name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  name="generator")
    self.generator = generator
    bricks += [generator]
    # SequenceGenerator.cost requires the target outputs, so it is not invoked
    # here; the classification cost defined below is used instead.
    # cost = self.generator.cost()

    # Now we can calculate our output
    out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                  activations=config.out_mlp_activations + [Identity()],
                  name='out_mlp')
    bricks += [out_mlp]
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    is_candidate = tensor.eq(
        tensor.arange(config.n_entities, dtype='int32')[None, None, :],
        tensor.switch(candidates_mask, candidates,
                      -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
    probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

    # Calculate prediction, cost and error rate
    pred = probs.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Apply dropout
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    # Other stuff
    cost_reg.name = cost.name = 'cost'
    error_rate_reg.name = error_rate.name = 'error_rate'
    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialize bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
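# Illustrative only: the candidate restriction above builds, per example, an
# indicator over entity ids that appear among the unmasked candidates and
# pushes the logits of every other entity down to -1000 before the softmax.
# A small NumPy sketch of the same idea; shapes and values are arbitrary.
import numpy as np

n_entities = 5
candidates = np.array([[0, 3, 2]])        # candidate entity ids, batch of 1
candidates_mask = np.array([[1, 1, 0]])   # third candidate slot is padding
masked = np.where(candidates_mask, candidates, -1)
is_candidate = (np.arange(n_entities)[None, None, :] ==
                masked[:, :, None]).sum(axis=1) > 0
logits = np.random.randn(1, n_entities)
logits = np.where(is_candidate, logits, -1000.0)
# Only entities 0 and 3 can now receive non-negligible probability.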
def __init__(self, recordings_source, labels_source, eos_label,
             num_features, num_phonemes,
             dim_dec, dims_bidir, dims_bottom,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             lm=None, character_map=None,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             bottom_activation=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
             **kwargs):
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    bottom_activation = bottom_activation
    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      dims_bottom[-1] if len(dims_bottom) else num_features,
                      subsample)

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(dim=dim_dec,
                                         activation=Tanh(),
                                         name="transition")
    else:
        transitions = [self.dec_transition(dim=dim_dec,
                                           activation=Tanh(),
                                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1],
            match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1],
            match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if lm:
        # In case we use LM it is Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")

    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states if use_states_for_readout else [])
                     + [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [Bias(post_merge_dims[0]).apply,
             post_merge_activation.apply,
             MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                 # MLP was not designed to support Maxout as an activation
                 # (because Maxout in a way is not one). However,
                 # a single-layer Maxout network works with the trick below.
                 # For a deeper Maxout network one has to use the
                 # Sequence brick.
                 [d // getattr(post_merge_activation, 'num_pieces', 1)
                  for d in post_merge_dims] + [num_phonemes]).apply],
            name='post_merge')

    readout = Readout(**readout_config)

    language_model = None
    if lm:
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
            logger.warn(
                "Beam search is prone to fail with no log-prob normalization")
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    self.batch_inputs = [self.recordings, self.recordings_mask,
                         self.labels, self.labels_mask]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)