def build_model_vanilla(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = state if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        # TODO correct bug
        # hidden_states.append(h * x_mask)
        hidden_states.append(h)
        hidden_states[0].name = "hidden_state_0"
        # Note: if we have a mask, then updating the initial state
        # with the last state does not make sense anymore.
        last_states[0] = h[-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
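# A minimal, self-contained sketch (hedged: it assumes a Blocks version that
# exposes RecurrentStack from blocks.bricks.recurrent; the dimensions are made
# up for illustration) of the pattern shared by the examples in this section:
# build one transition per layer, wrap the list in a RecurrentStack, and apply
# it to a (time, batch, dim) sequence.
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import RecurrentStack, SimpleRecurrent

transitions = [SimpleRecurrent(dim=4, activation=Tanh()) for _ in range(2)]
stack = RecurrentStack(transitions, skip_connections=False)

x = tensor.tensor3('inputs')   # (time, batch, 4)
mask = tensor.matrix('mask')   # (time, batch)
h = stack.apply(inputs=x, mask=mask)
# With two SimpleRecurrent layers, h is a list holding one state sequence per
# layer; the extra outputs carry the layer suffix produced by
# RecurrentStack.suffix (exercised by the tests further below).
print(stack.apply.outputs)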
def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth,
             trg_dgru_depth, emitter, feedback_brick, merge=None,
             merge_prototype=None, post_merge=None, **kwargs):
    merged_dim = igru_state_dim
    if not merge:
        merge = Merge(input_names=kwargs['source_names'],
                      prototype=merge_prototype)
    if not post_merge:
        post_merge = Bias(dim=merged_dim)
    # for compatibility
    if igru_depth == 1:
        self.igru = IGRU(dim=igru_state_dim)
    else:
        self.igru = RecurrentStack(
            [IGRU(dim=igru_state_dim, name='igru')] +
            [UpperIGRU(dim=igru_state_dim, activation=Tanh(),
                       name='upper_igru' + str(i))
             for i in range(1, igru_depth)],
            skip_connections=True)
    self.embedding_dim = embedding_dim
    self.emitter = emitter
    self.feedback_brick = feedback_brick
    self.merge = merge
    self.post_merge = post_merge
    self.merged_dim = merged_dim
    self.igru_depth = igru_depth
    self.trg_dgru_depth = trg_dgru_depth
    self.lookup = LookupTable(name='embeddings')
    self.vocab_size = vocab_size
    self.igru_state_dim = igru_state_dim
    self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                 output_dim=vocab_size)
    self.gru_fork = Fork(
        [name for name in self.igru.apply.sequences
         if name != 'mask' and name != 'input_states'],
        prototype=Linear(), name='gru_fork')

    children = [self.emitter, self.feedback_brick, self.merge,
                self.post_merge, self.igru, self.lookup,
                self.gru_to_softmax, self.gru_fork]
    kwargs.setdefault('children', []).extend(children)
    super(Interpolator, self).__init__(**kwargs)
def setUp(self):
    depth = 4
    self.depth = depth
    dim = 3  # don't change, hardwired in the code
    transitions = [LSTM(dim=dim) for _ in range(depth)]

    self.stack0 = RecurrentStack(transitions,
                                 weights_init=Constant(2),
                                 biases_init=Constant(0))
    self.stack0.initialize()

    self.stack2 = RecurrentStack(transitions,
                                 weights_init=Constant(2),
                                 biases_init=Constant(0),
                                 skip_connections=True)
    self.stack2.initialize()
def __init__(self, hidden_size_recurrent, k, **kwargs):
    super(Scribe, self).__init__(**kwargs)

    readout_size = 6 * k + 1

    transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                 name="gru_{}".format(i))
                  for i in range(3)]
    transition = RecurrentStack(transition, name="transition",
                                skip_connections=True)

    emitter = BivariateGMMEmitter(k=k)

    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    readout = Readout(
        readout_dim=readout_size,
        source_names=source_names,
        emitter=emitter,
        name="readout")

    self.generator = SequenceGenerator(readout=readout,
                                       transition=transition,
                                       name="generator")
    self.children = [self.generator]
def test_split_suffix(self):
    # generate some numbers
    level1, level2 = numpy.random.randint(1, 150, size=(2,))
    name1 = "somepart"
    # test cases like (<given_name>, <expected_name>, <expected_level>)
    # name, level, level2 and sep will be provided
    test_cases = [
        # case layer == 0
        ("{name}", "{name}", 0),
        # case empty name part
        ("{sep}{level}", "", level1),
        # normal case
        ("{name}{sep}{level}", "{name}", level1),
        # case nested recurrent stacks
        ("{name}{sep}{level}{sep}{level2}", "{name}{sep}{level}", level2),
        # some more edge cases...
        ("{sep}{name}{sep}{level}", "{sep}{name}", level1),
        ("{name}{sep}", "{name}{sep}", 0),
        ("{name}{sep}{name}", "{name}{sep}{name}", 0),
        ("{name}{sep}{level}{sep}{name}", "{name}{sep}{level}{sep}{name}", 0)
    ]
    # check all test cases
    for _name, _expected_name_part, expected_level in test_cases:
        # fill in actual details like the current RECURRENTSTACK_SEPARATOR
        name = _name.format(name=name1, level=level1, level2=level2,
                            sep=RECURRENTSTACK_SEPARATOR)
        expected_name_part = _expected_name_part.format(
            name=name1, level=level1, level2=level2,
            sep=RECURRENTSTACK_SEPARATOR)
        name_part, level = RecurrentStack.split_suffix(name)
        assert name_part == expected_name_part and level == expected_level, \
            "expected split_suffix(\"{}\") -> name(\"{}\"), level({}) " \
            "got name(\"{}\"), level({})".format(
                name, expected_name_part, expected_level, name_part, level)
def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
             **kwargs):
    super(TargetWordEncoder, self).__init__(**kwargs)

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dgru_state_dim = dgru_state_dim
    self.lookup = LookupTable(name='embeddings')
    self.dgru_depth = dgru_depth
    self.dgru = RecurrentStack([DGRU(activation=Tanh(),
                                     dim=self.dgru_state_dim)
                                for _ in range(dgru_depth)],
                               skip_connections=True)
    self.gru_fork = Fork(
        [name for name in self.dgru.apply.sequences if name != 'mask'],
        prototype=Linear(), name='gru_fork')

    self.children = [self.lookup, self.dgru, self.gru_fork]
def test_suffix(self):
    # level >= 0 !!
    level1, = numpy.random.randint(1, 150, size=(1,))
    # name1 != "mask" !!
    name1 = "somepart"
    test_cases = [
        ("mask", level1, "mask"),
        ("{name}", 0, "{name}"),
        ("{name}", level1, "{name}{sep}{level}")
    ]
    for _name, level, _expected_result in test_cases:
        name = _name.format(name=name1, level=level1,
                            sep=RECURRENTSTACK_SEPARATOR)
        expected_result = _expected_result.format(
            name=name1, level=level1, sep=RECURRENTSTACK_SEPARATOR)
        result = RecurrentStack.suffix(name, level)
        assert result == expected_result, \
            "expected suffix(\"{}\",{}) -> \"{}\" got \"{}\"".format(
                name, level, expected_result, result)
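# A quick illustration of the naming scheme the two tests above exercise
# (hedged: the exact separator character is RECURRENTSTACK_SEPARATOR and
# depends on the installed Blocks version).
from blocks.bricks.recurrent import RecurrentStack

name = RecurrentStack.suffix('states', 2)   # 'states' + separator + '2'
assert RecurrentStack.split_suffix(name) == ('states', 2)
assert RecurrentStack.suffix('states', 0) == 'states'  # level 0 keeps the name
assert RecurrentStack.suffix('mask', 2) == 'mask'       # 'mask' is never suffixed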
class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=theano.config.floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        calc_stack_layers = [
            theano.function([x, mask], self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value,
                            rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
def build_model_lstm(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    transitions = [LSTM(dim=args.state_dim, activation=Tanh())
                   for _ in range(args.layers)]

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(mask=x_mask, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    hidden_states = []
    for d in range(args.layers):
        # TODO correct bug
        # h[5 * d] = h[5 * d] * x_mask
        # h[5 * d + 1] = h[5 * d + 1] * x_mask
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

        h[5 * d].name = "hidden_state_" + str(d)
        h[5 * d + 1].name = "hidden_cell_" + str(d)
        hidden_states.extend([h[5 * d], h[5 * d + 1]])

    # The updates of the hidden states
    # Note: if we have a mask, then updating the initial state
    # with the last state does not make sense anymore.
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))
        updates.append((inits[1][d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if args.layers > 1:
        if args.skip_connections or args.skip_output:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state_all"

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, gate_values, hidden_states
def build_model_cw(args, dtype=floatX):
    logger.info('Building model ...')

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn, x_mask = get_prernn(args)

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    if args.module_order == "fast_in_slow":
        transitions = [ClockworkBase(dim=args.state_dim, activation=Tanh(),
                                     period=2 ** i)
                       for i in range(args.layers)]
    elif args.module_order == "slow_in_fast":
        transitions = [ClockworkBase(dim=args.state_dim, activation=Tanh(),
                                     period=2 ** (args.layers - i - 1))
                       for i in range(args.layers)]
    else:
        assert False

    rnn = RecurrentStack(transitions, skip_connections=args.skip_connections)

    initialize_rnn(rnn, args)

    # Prepare inputs and initial states for the RNN
    kwargs, inits = get_rnn_kwargs(pre_rnn, args)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, mask=x_mask, **kwargs)

    # In the Clockwork case:
    # h = [state, time, state_1, time_1 ...]
    h = h[::2]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if args.layers > 1
    # h = [state] if args.layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    hidden_states = []
    if args.layers > 1:
        # Save all the last states
        for d in range(args.layers):
            # TODO correct the bug
            # h[d] = h[d] * x_mask
            last_states[d] = h[d][-1, :, :]
            h[d].name = "hidden_state_" + str(d)
            hidden_states.append(h[d])
        h = tensor.concatenate(h, axis=2)
    else:
        h = h[0] * x_mask
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state_all"

    # The updates of the hidden states
    updates = []
    for d in range(args.layers):
        updates.append((inits[0][d], last_states[d]))

    presoft = get_presoft(h, args)

    cost, unregularized_cost = get_costs(presoft, args)

    return cost, unregularized_cost, updates, hidden_states
def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
    super(PyramidLayer, self).__init__(**kwargs)

    target_size = frame_size * k

    depth_x = depth
    hidden_size_mlp_x = 32 * size

    depth_transition = depth - 1

    depth_theta = depth
    hidden_size_mlp_theta = 32 * size
    hidden_size_recurrent = 32 * size * 3

    depth_context = depth
    hidden_size_mlp_context = 32 * size
    context_size = 32 * size

    activations_x = [Rectifier()] * depth_x
    dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
             [4 * hidden_size_recurrent]

    activations_theta = [Rectifier()] * depth_theta
    dims_theta = [hidden_size_recurrent] + \
                 [hidden_size_mlp_theta] * depth_theta

    activations_context = [Rectifier()] * depth_context
    dims_context = [frame_size] + \
                   [hidden_size_mlp_context] * (depth_context - 1) + \
                   [context_size]

    mlp_x = MLP(activations=activations_x, dims=dims_x, name="mlp_x")

    feedback = DeepTransitionFeedback(mlp=mlp_x)

    transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                 use_bias=True,
                                 name="gru_{}".format(i))
                  for i in range(depth_transition)]
    transition = RecurrentStack(transition, name="transition",
                                skip_connections=True)
    self.transition = transition

    mlp_theta = MLP(activations=activations_theta, dims=dims_theta,
                    name="mlp_theta")

    mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001,
                     name="gmm_wrap")

    gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k)

    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    attention = SimpleSequenceAttention(
        state_names=source_names,
        state_dims=[hidden_size_recurrent],
        attended_dim=context_size,
        name="attention")

    # ipdb.set_trace()
    # Verify source names
    readout = Readout(
        readout_dim=hidden_size_recurrent,
        source_names=source_names + ['feedback'] + ['glimpses'],
        emitter=gmm_emitter,
        feedback_brick=feedback,
        name="readout")

    self.generator = SequenceGenerator(readout=readout,
                                       transition=transition,
                                       attention=attention,
                                       name="generator")

    self.mlp_context = MLP(activations=activations_context,
                           dims=dims_context)

    self.children = [self.generator, self.mlp_context]
    self.final_states = []
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i)
                   for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print(kwargs)

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
class SimplePyramidLayer(Initializable):
    """Basic unit for the pyramid model."""

    def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x
        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta
        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x, dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]
        self.transition = RecurrentStack(transition, name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta, dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size, k=k)

        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]
        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition, self.gmm_emitter,
                         self.fork]

    def monitoring_vars(self, cg):
        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma, min_mu, max_mu,
                           mean_mu, max_sigma]
        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))

        self.final_states = []
        for var in h:
            self.final_states.append(
                var[-1].copy(name=var.name + "_final_value"))

        cost = self.gmm_emitter.cost(h[-1], x)
        return cost.mean()

    @application
    def generate(self, context, **kwargs):
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))
        return self.gmm_emitter.emit(h[-1])
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):
    # ----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Convert a positive float into a short tag-usable string.

        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (("%e" % x)[0],
                                            -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout * 10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d' % int(uniform * 100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s" % old_model_name)

    # ----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    skip_connections=skip or top)
        if skip:
            source_names = [RecurrentStack.suffix('states', d)
                            for d in range(depth)]
        else:
            source_names = [RecurrentStack.suffix('states', depth - 1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names = ['states']

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform * 2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(floatX)
    x = x[:max_length, :, :]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value in params.items()],
        width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    # ------------------------------------------------------------
    extensions = []
    if old_model_name:
        if old_model_name == 'continue':
            old_model_name = jobname
        with open(old_model_name + '_model', "rb") as f:
            old_model = pickle.load(f)
        model.set_parameter_values(old_model.get_parameter_values())
        del old_model
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    # ------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, parameters=cg.parameters,
        step_rule=step_rule)

    # ------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = energies.min().copy(name="min_energy")
    max_energy = energies.max().copy(name="max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(param.norm(2).copy(name=name + "_norm"))
        observables.append(algorithm.gradients[param].norm(2).copy(
            name=name + "_grad_norm"))

    # ------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path[0], datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                           which_sets=['train'], sources=('features',),
                           load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                          which_sets=['test'], sources=('features',),
                          load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' % (label, batch_count,
                                             examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),

                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),

                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from a few batches ago in the other
                   Checkpoint(jobname,
                              before_training=False, after_epoch=True,
                              save_separately=['log', 'model']),

                   Sample(generator, steps=max_length,
                          path=jobname + '.test',
                          every_n_batches=100),

                   ProgressBar(),

                   FinishAfter(after_n_epochs=epochs)
                   # This shows a way to handle NaN emerging during
                   # training: simply finish it.
                   .add_condition(["after_batch"], _is_nan),
                   ]

    if bokeh:
        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(
            'sketch', channels=[['cost']], every_n_batches=10))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)

    main_loop.run()
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology. For compatibility with
    the previous version, we call it Decimator.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        # representation
        self.dgru = RecurrentStack([DGRU(activation=Tanh(),
                                         dim=self.dgru_state_dim)
                                    for _ in range(dgru_depth)],
                                   skip_connections=True)
        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim, name='src_word_with_fork'),
            name='bidir_src_word_encoder')
        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')
        # map to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [self.lookup, self.dgru, self.gru_fork,
                         self.bidir_w, self.wl]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
def __init__(self, input_dims, input_num_chars, eos_label, num_phonemes,
             dim_dec, dims_bidir, enc_transition, dec_transition,
             use_states_for_readout, attention_type, criterion, bottom,
             lm=None, character_map=None, bidir=True, subsample=None,
             dims_top=None, prior=None, conv_n=None,
             post_merge_activation=None, post_merge_dims=None,
             dim_matcher=None, embed_outputs=True,
             dim_output_embedding=None, dec_stack=1, conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             **kwargs):
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_dims=input_dims,
                          input_num_chars=input_num_chars,
                          name='bottom', **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.get_dim(bottom.apply.outputs[0]),
                      subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    generators = [None, None]
    for i in range(2):
        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded],
                      name="top{}".format(i))
        else:
            top = Identity(name='top{}'.format(i))

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition{}".format(i))
        else:
            transitions = [self.dec_transition(
                               dim=dim_dec, activation=Tanh(),
                               name="transition_{}_{}".format(i, trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att{}".format(i))
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att{}".format(i))
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter{}".format(i))
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(
                criterion['name']))

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout{}".format(i))

        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [Bias(post_merge_dims[0]).apply,
                 post_merge_activation.apply,
                 MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                     [Identity()],
                     # MLP was designed to support Maxout as activation
                     # (because Maxout in a way is not one). However a
                     # single-layer Maxout network works with the trick
                     # below. For a deeper Maxout network one has to use
                     # the Sequence brick.
                     [d // getattr(post_merge_activation, 'num_pieces', 1)
                      for d in post_merge_dims] + [num_phonemes]).apply],
                name='post_merge{}'.format(i))

        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights < 1):
                logger.warn("Beam search is prone to fail with no "
                            "log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generators[i] = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator{}".format(i))

    self.generator = generators[0]
    self.forward_to_backward = Linear(dim_dec, dim_dec)

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generators = generators
    self.children = [self.forward_to_backward, encoder, top,
                     bottom] + generators

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
def __init__(self, input_dims, input_num_chars, bos_label, eos_label,
             num_labels, dim_dec, dims_bidir, enc_transition, dec_transition,
             use_states_for_readout, attention_type, criterion, bottom,
             lm=None, token_map=None, bidir=True, window_size=None,
             max_length=None, subsample=None, dims_top=None,
             extra_input_dim=None, prior=None, conv_n=None,
             post_merge_activation=None, post_merge_dims=None,
             dim_matcher=None, embed_outputs=True,
             dim_output_embedding=None, reuse_bottom_lookup_table=False,
             dec_stack=1, conv_num_filters=1, data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             # for criterions involving generation of outputs, whether
             # or not they should be generated by the recognizer itself
             generate_predictions=True, compute_targets=True,
             extra_generation_steps=3,
             **kwargs):
    all_arguments = copy.deepcopy(locals())
    all_arguments.update(copy.deepcopy(kwargs))
    del all_arguments['kwargs']
    del all_arguments['self']

    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(EncoderDecoder, self).__init__(**kwargs)
    self.bos_label = bos_label
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion
    self.generate_predictions = generate_predictions
    self.extra_generation_steps = extra_generation_steps
    self.compute_targets = compute_targets

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom', **bottom)

    # BiRNN
    if dims_bidir:
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
    elif window_size:
        encoder = ConvEncoder(
            max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
    else:
        raise ValueError("Don't know which Encoder to use")
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        assert not extra_input_dim
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError("Unknown attention type {}"
                         .format(attention_type))
    if not embed_outputs:
        raise ValueError("embed_outputs=False is not supported any more")
    if not reuse_bottom_lookup_table:
        embedding = LookupTable(num_labels + 1,
                                dim_dec
                                if dim_output_embedding is None
                                else dim_output_embedding)
    else:
        embedding = bottom.children[0]
    feedback = Feedback(
        embedding=embedding,
        output_names=[s for s in transition.apply.sequences
                      if s != 'mask'])

    # Create a readout
    readout_config = dict(
        num_tokens=num_labels,
        input_names=(transition.apply.states
                     if use_states_for_readout else []) +
        [attention.take_glimpses.outputs[0]],
        name="readout")
    if post_merge_dims:
        readout_config['merge_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [Bias(post_merge_dims[0]).apply,
             post_merge_activation.apply,
             MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                 [Identity()],
                 # MLP was designed to support Maxout as activation
                 # (because Maxout in a way is not one). However a
                 # single-layer Maxout network works with the trick below.
                 # For a deeper Maxout network one has to use the
                 # Sequence brick.
                 [d // getattr(post_merge_activation, 'num_pieces', 1)
                  for d in post_merge_dims] + [num_labels]).apply],
            name='post_merge')
    if 'reward' in criterion and criterion['name'] != 'log_likelihood':
        if criterion['reward'] == 'edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label)
        elif criterion['reward'] == 'delta_edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label, deltas=True)
        elif criterion['reward'] == 'bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=False)
        elif criterion['reward'] == 'delta_bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=True)
        else:
            raise ValueError("Unknown reward type")
    if criterion['name'] == 'log_likelihood':
        readout_class = SoftmaxReadout
    elif criterion['name'] == 'critic':
        readout_class = CriticReadout
        criterion_copy = dict(criterion)
        del criterion_copy['name']
        readout_config.update(**criterion_copy)
    elif criterion['name'] == 'reinforce':
        readout_class = ReinforceReadout
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['entropy'] = criterion.get('entropy')
        readout_config['input_names'] += ['attended', 'attended_mask']
    elif criterion['name'] in ['sarsa', 'actor_critic']:
        readout_class = ActorCriticReadout
        if criterion['name'] == 'actor_critic':
            critic_arguments = dict(all_arguments)
            # No worries, critic will not compute log likelihood values.
            critic_arguments['criterion'] = {
                'name': 'critic',
                'value_softmax': criterion.get('value_softmax'),
                'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                'groundtruth_word_bonus': criterion.get(
                    'groundtruth_word_bonus'),
                'dueling_outputs': criterion.get('dueling_outputs')}
            critic_arguments['name'] = 'critic'
            if criterion.get('critic_uses_actor_states'):
                critic_arguments['extra_input_dim'] = dim_dec
            if (criterion.get('value_softmax') or
                    criterion.get('same_value_for_wrong') or
                    criterion.get('dueling_outputs')):
                # Add an extra output for the critic
                critic_arguments['num_labels'] = num_labels + 1
            if criterion.get('force_bidir'):
                critic_arguments['dims_bidir'] = [dim_dec]
            critic_arguments['reuse_bottom_lookup_table'] = True
            critic_arguments['input_num_chars'] = {'inputs': num_labels}
            if criterion.get('downsize_critic'):
                critic_arguments = _downsize_config(
                    critic_arguments, criterion['downsize_critic'])
            critic = EncoderDecoder(**critic_arguments)
            readout_config['critic'] = critic
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['freeze_actor'] = criterion.get('freeze_actor')
        readout_config['freeze_critic'] = criterion.get('freeze_critic')
        readout_config['critic_uses_actor_states'] = criterion.get(
            'critic_uses_actor_states')
        readout_config['critic_uses_groundtruth'] = criterion.get(
            'critic_uses_groundtruth')
        readout_config['critic_burnin_steps'] = criterion.get(
            'critic_burnin_steps')
        readout_config['critic_loss'] = criterion.get('critic_loss')
        readout_config['discount'] = criterion.get('discount')
        readout_config['entropy_reward_coof'] = criterion.get(
            'entropy_reward_coof')
        readout_config['cross_entropy_reward_coof'] = criterion.get(
            'cross_entropy_reward_coof')
        readout_config['value_penalty'] = criterion.get('value_penalty')
        readout_config['value_penalty_type'] = criterion.get(
            'value_penalty_type')
        readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
        readout_config['bos_token'] = bos_label
        readout_config['accumulate_outputs'] = criterion.get(
            'accumulate_outputs')
        readout_config['use_value_biases'] = criterion.get('use_value_biases')
        readout_config['actor_grad_estimate'] = criterion.get(
            'actor_grad_estimate')
        readout_config['input_names'] += ['attended', 'attended_mask']
        # Note that the settings below are for the "clean" mode.
        # When get_cost_graph() is run with training=True, they
        # are temporarily overridden with the "real" settings from
        # "criterion"
        readout_config['compute_targets'] = True
        readout_config['trpo_coef'] = 0.0
        readout_config['solve_bellman'] = True
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))
    readout = readout_class(**readout_config)

    if lm:
        raise ValueError("LM is currently not supported")

    recurrent = AttentionRecurrent(transition, attention)
    if extra_input_dim:
        recurrent = RecurrentWithExtraInput(
            recurrent, "extra_inputs", extra_input_dim,
            name="with_extra_inputs")
    generator = SequenceGenerator(
        recurrent=recurrent, readout=readout, feedback=feedback,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.softmax = Softmax()
    self.children = [encoder, top, bottom, generator, self.softmax]

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.predicted_labels = tensor.lmatrix('predicted_labels')
    self.predicted_mask = tensor.matrix('predicted_mask')
    self.prefix_labels = tensor.lmatrix('prefix_labels')
    self.prefix_steps = tensor.lscalar('prefix_steps')

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.single_predicted_labels = tensor.lvector('predicted_labels')
    self.n_steps = tensor.lscalar('n_steps')

    # Configure mixed_generate
    if criterion['name'] == 'actor_critic':
        critic = self.generator.readout.critic
        self.mixed_generate.sequences = []
        self.mixed_generate.states = (
            ['step'] +
            self.generator.recurrent.apply.states +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.states])
        self.mixed_generate.outputs = (
            ['samples', 'step'] +
            self.generator.recurrent.apply.outputs +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.outputs])
        self.mixed_generate.contexts = (
            self.generator.recurrent.apply.contexts +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.contexts] +
            ['groundtruth', 'groundtruth_mask'])
        self.initial_states.outputs = self.mixed_generate.states

    self.prefix_generate.sequences = []
    self.prefix_generate.states = (
        ['step'] + self.generator.recurrent.apply.states)
    self.prefix_generate.outputs = (
        ['samples', 'step'] + self.generator.recurrent.apply.outputs)
    self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states and memory cells
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
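One way the returned gate_values might be consumed, assuming vocab_size and args are in scope: reduce each gate tensor to a mean activation and hand the results to whatever monitoring extension is in use. The naming scheme below is illustrative.

cost, cross_entropy, updates, gate_values = build_model_lstm(vocab_size, args)

monitored = []
for kind, per_layer in gate_values.items():
    # per_layer is one (Time x Batch x dim) tensor per LSTM layer
    for layer, gate in enumerate(per_layer):
        mean_activation = gate.mean()
        mean_activation.name = "{}_{}_mean".format(kind, layer)
        monitored.append(mean_activation)
# `monitored` can then be passed to a TrainingDataMonitoring extension,
# as in the main_rnn example further down.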
class TargetWordEncoder(Initializable):
    """Word encoder on the target side.

    Uses a single RNN to map a character-level word to a vector.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(TargetWordEncoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.dgru_depth = dgru_depth
        self.lookup = LookupTable(name='embeddings')
        self.dgru = RecurrentStack([DGRU(activation=Tanh(),
                                         dim=self.dgru_state_dim)
                                    for _ in range(dgru_depth)],
                                   skip_connections=True)

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        # Time as first dimension
        # only one batch
        embeddings = self.lookup.apply(target_single_char)

        if states is None:
            states = self.dgru.initial_states(batch_size)
        states_dict = {'states': states[0]}
        for i in range(1, self.dgru_depth):
            states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = \
                states[i]

        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    states_dict, {'mask': mask, 'iterate': False}))
        return gru_out

    @single_emit.property('outputs')
    def single_emit_outputs(self):
        return ['gru_out' + RECURRENTSTACK_SEPARATOR + str(i)
                for i in range(self.dgru_depth)]

    def get_dim(self, name):
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        return super(TargetWordEncoder, self).get_dim(name)
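The states_dict built in single_emit follows the RecurrentStack naming scheme. Spelled out for a depth of 3; the separator value '#' is the one visible elsewhere in these snippets (e.g. the "states#2" source name used further down).

SEP = '#'  # value of RECURRENTSTACK_SEPARATOR in Blocks
depth = 3
state_names = ['states'] + ['states' + SEP + str(i) for i in range(1, depth)]
# state_names == ['states', 'states#1', 'states#2']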
dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
         [4 * hidden_size_recurrent]

activations_theta = [Rectifier()] * depth_theta
dims_theta = [hidden_size_recurrent] + \
             [hidden_size_mlp_theta] * depth_theta

mlp_x = MLP(activations=activations_x, dims=dims_x, name="mlp_x")

transition = [LSTM(dim=hidden_size_recurrent, name="lstm_{}".format(i))
              for i in range(depth_lstm)]
transition = RecurrentStack(transition, name="transition",
                            skip_connections=True)

mlp_theta = MLP(activations=activations_theta, dims=dims_theta,
                name="mlp_theta")
mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001,
                 name="gmm_wrap")
gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k)

bricks = [mlp_x, transition, gmm_emitter]
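A worked example of the dimension bookkeeping above, with made-up sizes. The final entry is 4 * hidden_size_recurrent so that mlp_x feeds all four LSTM gate blocks at once, matching the virtual_dim = 4 * state_dim convention used in the LSTM language model earlier.

frame_size = 3
depth_x = depth_theta = 4
hidden_size_mlp_x = hidden_size_mlp_theta = 500
hidden_size_recurrent = 400

dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
         [4 * hidden_size_recurrent]
# dims_x == [3, 500, 500, 500, 1600]
dims_theta = [hidden_size_recurrent] + \
             [hidden_size_mlp_theta] * depth_theta
# dims_theta == [400, 500, 500, 500, 500]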
class Interpolator(AbstractReadout): """Readout char by char.""" def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth, trg_dgru_depth, emitter, feedback_brick, merge=None, merge_prototype=None, post_merge=None, **kwargs): merged_dim = igru_state_dim if not merge: merge = Merge(input_names=kwargs['source_names'], prototype=merge_prototype) if not post_merge: post_merge = Bias(dim=merged_dim) # for compatible if igru_depth == 1: self.igru = IGRU(dim=igru_state_dim) else: self.igru = RecurrentStack( [IGRU(dim=igru_state_dim, name='igru')] + [ UpperIGRU(dim=igru_state_dim, activation=Tanh(), name='upper_igru' + str(i)) for i in range(1, igru_depth) ], skip_connections=True) self.embedding_dim = embedding_dim self.emitter = emitter self.feedback_brick = feedback_brick self.merge = merge self.post_merge = post_merge self.merged_dim = merged_dim self.igru_depth = igru_depth self.trg_dgru_depth = trg_dgru_depth self.lookup = LookupTable(name='embeddings') self.vocab_size = vocab_size self.igru_state_dim = igru_state_dim self.gru_to_softmax = Linear(input_dim=igru_state_dim, output_dim=vocab_size) self.gru_fork = Fork([ name for name in self.igru.apply.sequences if name != 'mask' and name != 'input_states' ], prototype=Linear(), name='gru_fork') children = [ self.emitter, self.feedback_brick, self.merge, self.post_merge, self.igru, self.lookup, self.gru_to_softmax, self.gru_fork ] kwargs.setdefault('children', []).extend(children) super(Interpolator, self).__init__(**kwargs) def _push_allocation_config(self): self.lookup.length = self.vocab_size self.lookup.dim = self.embedding_dim self.emitter.readout_dim = self.get_dim('readouts') self.merge.input_names = self.source_names self.merge.input_dims = self.source_dims self.merge.output_dim = self.merged_dim self.post_merge.input_dim = self.merged_dim self.post_merge.output_dim = self.igru_state_dim self.gru_fork.input_dim = self.embedding_dim self.gru_fork.output_dims = [ self.igru.get_dim(name) for name in self.gru_fork.output_names ] @application def initial_igru_outputs(self, batch_size): return self.igru.initial_states(batch_size) @application def emit(self, readouts): return self.emitter.emit(readouts) @application def cost(self, readouts, outputs): return self.emitter.cost(readouts, outputs) @application def initial_outputs(self, batch_size): return self.emitter.initial_outputs(batch_size) @application(outputs=['feedback']) def feedback(self, outputs): return self.feedback_brick.feedback(outputs) @application(outputs=['feedback']) def feedback_apply(self, target_char_seq, target_sample_matrix, target_char_aux): return self.feedback_brick.apply(target_char_seq, target_sample_matrix, target_char_aux) @application def single_feedback(self, target_single_char, batch_size, mask=None, states=None): return self.feedback_brick.single_emit(target_single_char, batch_size, mask, states) @single_feedback.property('outputs') def single_feedback_outputs(self): return [ 'single_feedback' + RECURRENTSTACK_SEPARATOR + str(i) for i in range(self.trg_dgru_depth) ] @application(outputs=['gru_out', 'readout_chars']) def single_readout_gru(self, target_prev_char, target_prev_char_aux, input_states, states): embeddings = self.lookup.apply(target_prev_char) states_dict = {'states': states[0]} if self.igru_depth > 1: for i in range(1, self.igru_depth): states_dict['states' + RECURRENTSTACK_SEPARATOR + str(i)] = states[i] gru_out = self.igru.apply(**merge( self.gru_fork.apply(embeddings, as_dict=True), states_dict, { 'mask': target_prev_char_aux, 
'input_states': input_states, 'iterate': False })) if self.igru_depth > 1: readout_chars = self.gru_to_softmax.apply(gru_out[-1]) else: readout_chars = self.gru_to_softmax.apply(gru_out) return gru_out, readout_chars @application def readout(self, **kwargs): merged = self.merge.apply( **{name: kwargs[name] for name in self.merge.input_names}) merged = self.post_merge.apply(merged) return merged @application(outputs=['readout_chars']) def readout_gru(self, target_prev_char_seq, target_prev_char_aux, input_states): embeddings = self.lookup.apply(target_prev_char_seq) gru_out = self.igru.apply( **merge(self.gru_fork.apply(embeddings, as_dict=True), { 'mask': target_prev_char_aux, 'input_states': input_states })) if self.igru_depth > 1: gru_out = gru_out[-1] readout_chars = self.gru_to_softmax.apply(gru_out) return readout_chars def get_dim(self, name): if name == 'outputs': return self.emitter.get_dim(name) elif name == 'feedback': return self.feedback_brick.get_dim(name) elif name == 'readouts': return self.readout_dim return super(AbstractReadout, self).get_dim(name)
class TestRecurrentStack(unittest.TestCase): def setUp(self): depth = 4 self.depth = depth dim = 3 # don't change, hardwired in the code transitions = [LSTM(dim=dim) for _ in range(depth)] self.stack0 = RecurrentStack(transitions, weights_init=Constant(2), biases_init=Constant(0)) self.stack0.initialize() self.stack2 = RecurrentStack(transitions, weights_init=Constant(2), biases_init=Constant(0), skip_connections=True) self.stack2.initialize() def do_one_step(self, stack, skip_connections=False, low_memory=False): depth = self.depth # batch=2 h0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth, dtype=theano.config.floatX) c0_val = 0.1 * numpy.array([[[1, 1, 0], [0, 1, 1]]] * depth, dtype=theano.config.floatX) x_val = 0.1 * numpy.array([range(12), range(12, 24)], dtype=theano.config.floatX) # we will use same weights on all layers W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX) kwargs = OrderedDict() for d in range(depth): if d > 0: suffix = RECURRENTSTACK_SEPARATOR + str(d) else: suffix = '' if d == 0 or skip_connections: kwargs['inputs' + suffix] = tensor.matrix('inputs' + suffix) kwargs['inputs' + suffix].tag.test_value = x_val kwargs['states' + suffix] = tensor.matrix('states' + suffix) kwargs['states' + suffix].tag.test_value = h0_val[d] kwargs['cells' + suffix] = tensor.matrix('cells' + suffix) kwargs['cells' + suffix].tag.test_value = c0_val[d] results = stack.apply(iterate=False, low_memory=low_memory, **kwargs) next_h = theano.function(inputs=list(kwargs.values()), outputs=results) def sigmoid(x): return 1. / (1. 
+ numpy.exp(-x)) h1_val = [] x_v = x_val args_val = [] for d in range(depth): if d == 0 or skip_connections: args_val.append(x_val) h0_v = h0_val[d] args_val.append(h0_v) c0_v = c0_val[d] args_val.append(c0_v) # omitting biases because they are zero activation = numpy.dot(h0_v, W_state_val) + x_v if skip_connections and d > 0: activation += x_val i_t = sigmoid(activation[:, :3] + c0_v * W_cell_to_in) f_t = sigmoid(activation[:, 3:6] + c0_v * W_cell_to_forget) next_cells = f_t * c0_v + i_t * numpy.tanh(activation[:, 6:9]) o_t = sigmoid(activation[:, 9:12] + next_cells * W_cell_to_out) h1_v = o_t * numpy.tanh(next_cells) # current layer output state transformed to input of next x_v = numpy.dot(h1_v, W_state2x_val) h1_val.append(h1_v) res = next_h(*args_val) for d in range(depth): assert_allclose(h1_val[d], res[d * 2], rtol=1e-6) def test_one_step(self): self.do_one_step(self.stack0) self.do_one_step(self.stack0, low_memory=True) self.do_one_step(self.stack2, skip_connections=True) self.do_one_step(self.stack2, skip_connections=True, low_memory=True) def do_many_steps(self, stack, skip_connections=False, low_memory=False): depth = self.depth # 24 steps # 4 batch examples # 12 dimensions per step x_val = (0.1 * numpy.asarray( list(itertools.islice(itertools.permutations(range(12)), 0, 24)), dtype=theano.config.floatX)) x_val = numpy.ones((24, 4, 12), dtype=theano.config.floatX) * x_val[:, None, :] # mask the last third of steps mask_val = numpy.ones((24, 4), dtype=theano.config.floatX) mask_val[12:24, 3] = 0 # unroll all states and cells for all steps and also initial value h_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX) c_val = numpy.zeros((depth, 25, 4, 3), dtype=theano.config.floatX) # we will use same weights on all layers W_state2x_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX) kwargs = OrderedDict() for d in range(depth): if d > 0: suffix = RECURRENTSTACK_SEPARATOR + str(d) else: suffix = '' if d == 0 or skip_connections: kwargs['inputs' + suffix] = tensor.tensor3('inputs' + suffix) kwargs['inputs' + suffix].tag.test_value = x_val kwargs['mask'] = tensor.matrix('mask') kwargs['mask'].tag.test_value = mask_val results = stack.apply(iterate=True, low_memory=low_memory, **kwargs) calc_h = theano.function(inputs=list(kwargs.values()), outputs=results) def sigmoid(x): return 1. / (1. 
+ numpy.exp(-x)) for i in range(1, 25): x_v = x_val[i - 1] h_vs = [] c_vs = [] for d in range(depth): h_v = h_val[d][i - 1, :, :] c_v = c_val[d][i - 1, :, :] activation = numpy.dot(h_v, W_state_val) + x_v if skip_connections and d > 0: activation += x_val[i - 1] i_t = sigmoid(activation[:, :3] + c_v * W_cell_to_in) f_t = sigmoid(activation[:, 3:6] + c_v * W_cell_to_forget) c_v1 = f_t * c_v + i_t * numpy.tanh(activation[:, 6:9]) o_t = sigmoid(activation[:, 9:12] + c_v1 * W_cell_to_out) h_v1 = o_t * numpy.tanh(c_v1) h_v = (mask_val[i - 1, :, None] * h_v1 + (1 - mask_val[i - 1, :, None]) * h_v) c_v = (mask_val[i - 1, :, None] * c_v1 + (1 - mask_val[i - 1, :, None]) * c_v) # current layer output state transformed to input of next x_v = numpy.dot(h_v, W_state2x_val) h_vs.append(h_v) c_vs.append(c_v) for d in range(depth): h_val[d][i, :, :] = h_vs[d] c_val[d][i, :, :] = c_vs[d] args_val = [x_val]*(depth if skip_connections else 1) + [mask_val] res = calc_h(*args_val) for d in range(depth): assert_allclose(h_val[d][1:], res[d * 2], rtol=1e-4) assert_allclose(c_val[d][1:], res[d * 2 + 1], rtol=1e-4) # Also test that initial state is a parameter for h in results: initial_states = VariableFilter(roles=[INITIAL_STATE])( ComputationGraph(h)) assert all(is_shared_variable(initial_state) for initial_state in initial_states) def test_many_steps(self): self.do_many_steps(self.stack0) self.do_many_steps(self.stack0, low_memory=True) self.do_many_steps(self.stack2, skip_connections=True) self.do_many_steps(self.stack2, skip_connections=True, low_memory=True)
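The assertions above rely on the ordering of stack.apply's return value for an LSTM stack: hidden states and cells alternate layer by layer, so layer d's state sits at index 2 * d and its cell at 2 * d + 1. A small helper, written only to make that indexing explicit:

def split_lstm_outputs(results, depth):
    """Group the flat list returned by stack.apply into per-layer pairs."""
    states = [results[2 * d] for d in range(depth)]
    cells = [results[2 * d + 1] for d in range(depth)]
    return states, cells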
def __init__(self, vocab_size, embedding_dim, dgru_state_dim, igru_state_dim, state_dim, representation_dim, transition_depth, trg_igru_depth, trg_dgru_depth, trg_space_idx, trg_bos, theano_seed=None, **kwargs): super(Decoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.dgru_state_dim = dgru_state_dim self.igru_state_dim = igru_state_dim self.state_dim = state_dim self.trg_space_idx = trg_space_idx self.representation_dim = representation_dim self.theano_seed = theano_seed # Initialize gru with special initial state self.transition = RecurrentStack([ GRUInitialState(attended_dim=state_dim, dim=state_dim, activation=Tanh(), name='decoder_gru_withinit') ] + [ GatedRecurrent( dim=state_dim, activation=Tanh(), name='decoder_gru' + str(i)) for i in range(1, transition_depth) ], skip_connections=False) # Initialize the attention mechanism self.attention = SequenceContentAttention( state_names=self.transition.apply.states, attended_dim=representation_dim, match_dim=state_dim, name="attention") self.interpolator = Interpolator( vocab_size=vocab_size, embedding_dim=embedding_dim, igru_state_dim=igru_state_dim, igru_depth=trg_igru_depth, trg_dgru_depth=trg_dgru_depth, source_names=[ 'states', 'feedback', self.attention.take_glimpses.outputs[0] ], readout_dim=self.vocab_size, emitter=SoftmaxEmitter(initial_output=trg_bos, theano_seed=theano_seed), feedback_brick=TargetWordEncoder(vocab_size, embedding_dim, self.dgru_state_dim, trg_dgru_depth)) # Build sequence generator accordingly self.sequence_generator = SequenceGeneratorDCNMT( trg_space_idx=self.trg_space_idx, readout=self.interpolator, transition=self.transition, attention=self.attention, transition_depth=transition_depth, igru_depth=trg_igru_depth, trg_dgru_depth=trg_dgru_depth, fork=Fork([ name for name in self.transition.apply.sequences if name != 'mask' ], prototype=Linear())) self.children = [self.sequence_generator]
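For orientation, a hypothetical instantiation of this Decoder; every size below is made up, and the comments only indicate where each constructor argument ends up in the bricks built above.

decoder = Decoder(
    vocab_size=120,             # character vocabulary shared with the readout
    embedding_dim=64,           # character embedding size
    dgru_state_dim=256,         # TargetWordEncoder GRU size
    igru_state_dim=256,         # Interpolator GRU size
    state_dim=512,              # per-layer size of the transition stack
    representation_dim=1024,    # attended_dim of the content attention
    transition_depth=2,         # layers in the decoder RecurrentStack
    trg_igru_depth=2,
    trg_dgru_depth=2,
    trg_space_idx=4,            # index of the word separator
    trg_bos=0)                  # initial output fed to the SoftmaxEmitter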
activations_theta = [Rectifier()] * depth_theta dims_theta = [hidden_size_recurrent] + \ [hidden_size_mlp_theta]*depth_theta mlp_x = MLP(activations=activations_x, dims=dims_x) feedback = DeepTransitionFeedback(mlp=mlp_x) transition = [ GatedRecurrent(dim=hidden_size_recurrent, name="gru_{}".format(i)) for i in range(depth_recurrent) ] transition = RecurrentStack(transition, name="transition", skip_connections=True) mlp_theta = MLP(activations=activations_theta, dims=dims_theta) mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001) emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k, name="emitter") source_names = [name for name in transition.apply.states if 'states' in name] readout = Readout(readout_dim=hidden_size_recurrent, source_names=source_names, emitter=emitter,
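The 'states' filter used for source_names above is what keeps cell variables out of the readout when the same pattern is applied to an LSTM stack (as in main_rnn further down). A throwaway comparison, with arbitrary dimensions:

from blocks.bricks import Tanh
from blocks.bricks.recurrent import LSTM, GatedRecurrent, RecurrentStack

gru_stack = RecurrentStack(
    [GatedRecurrent(dim=4, activation=Tanh()) for _ in range(2)])
lstm_stack = RecurrentStack([LSTM(dim=4) for _ in range(2)])

print(gru_stack.apply.states)   # e.g. ['states', 'states#1']
print(lstm_stack.apply.states)  # e.g. ['states', 'cells', 'states#1', 'cells#1']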
def __init__(self, recordings_source, labels_source, eos_label,
             num_features, num_phonemes,
             dim_dec, dims_bidir, dims_bottom,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             lm=None, character_map=None,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             bottom_activation=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             **kwargs):
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      dims_bottom[-1] if len(dims_bottom) else num_features,
                      subsample)

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if lm:
        # In case we use LM it is Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")

    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states if use_states_for_readout
                      else []) + [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")

    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [Bias(post_merge_dims[0]).apply,
             post_merge_activation.apply,
             MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                 [Identity()],
                 # MLP was designed to support Maxout as activation
                 # (because Maxout in a way is not one). However a single
                 # layer Maxout network works with the trick below. For a
                 # deeper Maxout network one has to use the Sequence brick.
                 [d // getattr(post_merge_activation, 'num_pieces', 1)
                  for d in post_merge_dims] + [num_phonemes]).apply],
            name='post_merge')

    readout = Readout(**readout_config)

    language_model = None
    if lm:
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if not (normalize_am_weights or normalize_lm_weights or
                normalize_tot_weights):
            logger.warn("Beam search is prone to fail with no log-prob "
                        "normalization")
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    self.batch_inputs = [self.recordings, self.recordings_mask,
                         self.labels, self.labels_mask]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)
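The lm dictionary consumed by the shallow-fusion branch above would look roughly like this. The values are placeholders, and the trailing 'path' entry is purely illustrative of whatever LanguageModel itself expects; only the five popped keys are grounded in the code.

lm_config = dict(
    weight=0.5,                   # popped as lm_weight
    normalize_am_weights=True,
    normalize_lm_weights=False,
    normalize_tot_weights=False,
    am_beta=1.0,
    # anything left over is forwarded to LanguageModel(nn_char_map=..., **lm)
    path='lm.npz',                # hypothetical LanguageModel argument
)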
def main_rnn(config): x = tensor.tensor3('features') y = tensor.matrix('targets') # if 'LSTM' in config['model'] : # from models import getLSTMstack # y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1])) # else : # raise Exception("These are not the LSTM we are looking for") # y_hat = model.apply(x) emitter = TestEmitter() # emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size']) # cost_func = SquaredError() # @application # def qwe(self, readouts, outputs=None): # print(type(self), type(readouts)) # x = cost_func.apply(readouts,outputs) # return x print(type(emitter.cost)) # emitter.cost = qwe # print(type(qwe)) steps = 2 n_samples= config['target_size'] transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)] transition = RecurrentStack(transition, name="transition", skip_connections=False) source_names = [name for name in transition.apply.states if 'states' in name] readout = Readout(emitter, readout_dim=config['lstm_hidden_size'], source_names=source_names,feedback_brick=None, merge=None, merge_prototype=None, post_merge=None, merged_dim=None) seqgen = SequenceGenerator(readout, transition, attention=None, add_contexts=False) seqgen.weights_init = IsotropicGaussian(0.01) seqgen.biases_init = Constant(0.) seqgen.push_initialization_config() seqgen.transition.biases_init = IsotropicGaussian(0.01,1) seqgen.transition.push_initialization_config() seqgen.initialize() states = seqgen.transition.apply.outputs print('states',states) states = {name: shared_floatx_zeros((n_samples, config['lstm_hidden_size'])) for name in states} cost_matrix = seqgen.cost_matrix(x, **states) cost = cost_matrix.mean() cost.name = "nll" cg = ComputationGraph(cost) model = Model(cost) #Cost # cost = SquaredError().apply(y_hat ,y) #cost = CategoricalCrossEntropy().apply(T.flatten(),Y) # #for sampling #cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True)) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=config['learning_rate'])) #Getting the stream train_stream = MFCC.get_stream(config['batch_size'],config['source_size'],config['target_size'],config['num_examples']) #Monitoring stuff extensions = [Timing(), FinishAfter(after_n_batches=config['num_batches']), #DataStreamMonitoring([cost, error_rate],test_stream,prefix="test"), TrainingDataMonitoring([cost], prefix="train", every_n_batches=1), #Checkpoint(save_to), ProgressBar(), Printing(every_n_batches=1)] main_loop = MainLoop( algorithm, train_stream, # model=model, extensions=extensions) main_loop.run()
def build_model_hard(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence([lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] for i in range(layers - 1): mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1], weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear(input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared(numpy.zeros( (args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have correctly: # h = [state_1, state_2, state_3 ...] # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
bos_token = None, eos_token = None, unk_token = '<UNK>', level = 'character') alphabet_size = len(dictionary.keys()) lstm_dim = 512 lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) lstm3 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) rnn = RecurrentStack([lstm1, lstm2, lstm3], name="transition") readout = Readout(readout_dim = alphabet_size, source_names=["states#2"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(alphabet_size, feedback_dim=alphabet_size, name="feedback"), name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator")
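A possible sampling path for the generator above, mirroring the generate() call that appears (commented out) in the main_rnn snippet elsewhere in this collection. Treat it as a sketch rather than a tested recipe.

from blocks.graph import ComputationGraph
import theano

seq_gen.initialize()
sampling_graph = ComputationGraph(
    seq_gen.generate(n_steps=100, batch_size=1, iterate=True))
# n_steps and batch_size are plain ints, so the compiled function
# takes no inputs and returns states, samples and costs.
sample_fn = theano.function(sampling_graph.inputs, sampling_graph.outputs)
samples = sample_fn()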
def build_model_soft(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] # Build the MLP dims = [2 * state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear( input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] 
# gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates, gate_values
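For reference, the gating-MLP dims list assembled in the loop above expands as follows; the sizes are made up, and the 2 * state_dim input presumably corresponds to the two states the gate compares.

state_dim = 8
mlp_layers = 2

dims = [2 * state_dim]          # gate input
for _ in range(mlp_layers):
    dims.append(state_dim)      # one Rectifier hidden layer per mlp_layer
dims.append(1)                  # scalar gate value
# dims == [16, 8, 8, 1]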
'c': 3, '<UNK>': 4 }, bos_token=None, eos_token=None, unk_token='<UNK>', level='character') alphabet_size = 4 lstm_dim = 2 lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) rnn = RecurrentStack([lstm1, lstm2], name="transition") readout = Readout(readout_dim=alphabet_size, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(alphabet_size, feedback_dim=alphabet_size, name="feedback"), name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator")
[4*hidden_size_recurrent] activations_theta = [Rectifier()] * depth_theta dims_theta = [hidden_size_recurrent] + \ [hidden_size_mlp_theta]*depth_theta mlp_x = MLP(activations=activations_x, dims=dims_x, name="mlp_x") transition = [ LSTM(dim=hidden_size_recurrent, name="lstm_{}".format(i)) for i in range(depth_lstm) ] transition = RecurrentStack(transition, name="transition", skip_connections=True) mlp_theta = MLP(activations=activations_theta, dims=dims_theta, name="mlp_theta") mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001, name="gmm_wrap") gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k) bricks = [mlp_x, transition, gmm_emitter]
def build_model_soft(args, dtype=floatX): logger.info('Building model ...') # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn, x_mask = get_prernn(args) transitions = [SimpleRecurrent(dim=args.state_dim, activation=Tanh())] # Build the MLP dims = [2 * args.state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(args.state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(args.layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=args.state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=args.skip_connections) initialize_rnn(rnn, args) # Prepare inputs and initial states for the RNN kwargs, inits = get_rnn_kwargs(pre_rnn, args) # Apply the RNN to the inputs h = rnn.apply(low_memory=True, mask=x_mask, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] # gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} hidden_states = [] for d in range(args.layers): h[d] = h[d] * x_mask last_states[d] = h[d][-1, :, :] h[d].name = "hidden_state_" + str(d) hidden_states.append(h[d]) # Concatenate all the states if args.layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state_all" # The updates of the hidden states updates = [] for d in range(args.layers): updates.append((inits[0][d], last_states[d])) presoft = get_presoft(h, args) cost, cross_entropy = get_costs(presoft, args) return cost, cross_entropy, updates, gate_values, hidden_states
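A sketch of pulling the returned gate activations out for inspection, assuming args.layers > 1 so that gate_values is non-empty; recovering the inputs from the graph is illustrative.

from blocks.graph import ComputationGraph
import theano

cost, cross_entropy, updates, gate_values, hidden_states = \
    build_model_soft(args)

gate_cg = ComputationGraph(gate_values)
# gate_cg.inputs contains the 'features' matrix and the mask produced by
# get_prernn; check their order before calling the compiled function.
gate_fn = theano.function(gate_cg.inputs, gate_values)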