def getRnnGenerator(vocab_size, hidden_dim, input_dim=512):
    """Build an RNN generator to "apply" to the input x.

    For initializing the network, the vocab size needs to be known.
    The default size of the hidden layer is set to 512, as in Karpathy's
    char-rnn.
    """
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=["states"],  # the "states" output of transition.apply
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(vocab_size,
                                              input_dim,
                                              name='feedback'),
                name="readout"),
        MySimpleRecurrent(name="transition",
                          activation=Tanh(),
                          dim=hidden_dim),
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    generator.transition.weights_init = IsotropicGaussian(0.01)
    generator.initialize()
    return generator
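# --- Usage sketch (an assumption, not part of the original snippet):
# compile a sampling function for the generator built above, following the
# generate/ComputationGraph pattern used by the other examples here.
# The vocabulary size of 50 is purely illustrative.
from blocks.graph import ComputationGraph

generator = getRnnGenerator(vocab_size=50, hidden_dim=512)
sample = ComputationGraph(generator.generate(
    n_steps=20, batch_size=1, iterate=True)).get_theano_function()
states, outputs, costs = sample()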
def build_model(alphabet_size, config):
    layers = config['lstm_layers']
    dimensions = [config['lstm_dim_' + str(i)] for i in range(layers)]
    uniform_width = config['lstm_init_width']
    stack = []
    for dim in dimensions:
        stack.append(LSTM(dim=dim, use_bias=True,
                          weights_init=Uniform(width=uniform_width),
                          forget_init=Constant(1.)))
    recurrent_stack = RecurrentStack(stack, name='transition')

    readout = Readout(readout_dim=alphabet_size,
                      source_names=['states#' + str(layers - 1)],
                      emitter=SoftmaxEmitter(name='emitter'),
                      feedback_brick=LookupFeedback(alphabet_size,
                                                    feedback_dim=alphabet_size,
                                                    name='feedback'),
                      name='readout')

    generator = SequenceGenerator(readout=readout,
                                  transition=recurrent_stack,
                                  weights_init=Uniform(width=uniform_width),
                                  biases_init=Constant(0),
                                  name='generator')
    generator.push_initialization_config()
    generator.initialize()

    x = tensor.lmatrix('features')
    mask = tensor.fmatrix('features_mask')
    cost_matrix = generator.cost_matrix(x, mask=mask)

    log2e = math.log(math.e, 2)
    if 'batch_length' in config:
        length = config['batch_length'] - config['batch_overlap']
        cost = log2e * aggregation.mean(cost_matrix[:, -length:].sum(),
                                        mask[:, -length:].sum())
    else:
        cost = log2e * aggregation.mean(cost_matrix[:, :].sum(),
                                        mask[:, :].sum())
    cost.name = 'bits_per_character'
    return generator, cost
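# --- Illustrative config for build_model above. The keys are exactly the
# ones the function reads; the values are arbitrary examples.
config = {
    'lstm_layers': 2,
    'lstm_dim_0': 512,
    'lstm_dim_1': 512,
    'lstm_init_width': 0.08,
    'batch_length': 100,  # optional, used together with 'batch_overlap'
    'batch_overlap': 50,
}
generator, cost = build_model(alphabet_size=256, config=config)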
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8
    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim)
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_states, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_states, feedback_dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value
         in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state,
                             None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        assert False
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost_matrix(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[FinishAfter(after_n_batches=num_batches),
                        TrainingDataMonitoring([cost], prefix="this_step",
                                               after_batch=True),
                        TrainingDataMonitoring([cost], prefix="average",
                                               every_n_batches=100),
                        Checkpoint(save_path, every_n_batches=500),
                        Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1,
            iterate=True)).get_theano_function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(theano.config.floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states),
                                  dtype=theano.config.floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
readout = Readout(readout_dim=alphabet_size,
                  source_names=["states"],
                  emitter=SoftmaxEmitter(name="emitter"),
                  feedback_brick=LookupFeedback(alphabet_size,
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")
seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()

# cost setup, from the markov_tutorial
x = tensor.lvector('features')
x = x.reshape((x.shape[0], 1))
cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1])
cost.name = "negative log-likelihood"

cost_cg = ComputationGraph(cost)
print(VariableFilter(roles=[WEIGHT])(cost_cg.variables))
# theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png",
#                            var_with_name_simple=True)

algorithm = GradientDescent(cost=cost, parameters=list(
    Selector(seq_gen).get_parameters().values()))
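# --- Sketch (assumption, not in the original snippet): the same parameters
# can also be listed with their shapes through Selector, as several of the
# other examples here do when logging what is about to be trained.
from blocks.select import Selector
import pprint

pprint.pprint([(key, value.get_value().shape) for key, value
               in Selector(seq_gen).get_parameters().items()])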
def train():
    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len,
                                          hidden_size,
                                          name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout')

        transition = GatedRecurrent(activation=Tanh(), dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')
        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)])

    main.run()
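# --- Usage sketch (assumption): sampling from the 'trainingdata.tar'
# checkpoint written by train() above, following the load/get_top_bricks
# pattern that the character-LM example further down uses. Assumes the
# custom RandomSoftmaxEmitter class is importable when unpickling, and
# n_steps=100 is illustrative.
from blocks.serialization import load
from blocks.graph import ComputationGraph

with open('trainingdata.tar', 'rb') as f:
    main = load(f)
generator = main.model.get_top_bricks()[-1]
sample = ComputationGraph(generator.generate(
    n_steps=100, batch_size=1, iterate=True)).get_theano_function()
states, outputs, costs = sample()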
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower)
                            .get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([name for name in encoder.prototype.apply.sequences
                     if name != 'mask'],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension, attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(
            readout_dim=readout_dimension, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(readout_dimension, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in params.items()],
            width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout,
                           name="output")(cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(),
                                     "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([GradientClipping(10.0),
                                     SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2),
                name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(observables, prefix="average",
                                       every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch",
                    lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def main(mode, save_path, steps, time_budget, reset):
    num_states = ChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(save_path) and not reset:
            model = Pylearn2Model.load(save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: it would probably be nicer to make the cost part of the
        # model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=save_path, save_freq=10)
        train.main_loop(time_budget=time_budget)
    elif mode == "sample":
        model = Pylearn2Model.load(save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
def __init__(self, config, vocab_size):
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    context_bag = to_bag(context, vocab_size)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)
    # embeddings_initial_value = init_embedding_table(
    #     filename='embeddings/vocab_embeddings.txt')
    # embed.weights_init = Constant(embeddings_initial_value)

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 directions: fw & bw
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Build the encoder bricks
    transition = GatedRecurrent(activation=Tanh(),
                                dim=config.generator_lstm_size,
                                name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=cenc_dim,
        match_dim=config.generator_lstm_size,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=MaskedSoftmaxEmitter(context_bag=context_bag,
                                     name='emitter'),
        feedback_brick=LookupFeedback(vocab_size, config.feedback_size),
        name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  name="generator")

    cost = generator.cost(
        answer, answer_mask.astype(theano.config.floatX),
        attended=cenc,
        attended_mask=context_mask.astype(theano.config.floatX),
        name="cost")
    self.predictions = generator.generate(
        n_steps=7, batch_size=config.batch_size,
        attended=cenc,
        attended_mask=context_mask.astype(theano.config.floatX),
        iterate=True)[1]

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize the new parts manually (TODO: change this)
    generator.weights_init = IsotropicGaussian(0.01)
    generator.biases_init = Constant(0)
    generator.push_allocation_config()
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states,
                    source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(num_states, feedback_dim,
                                                  name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost_matrix(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[FinishAfter(after_n_batches=num_batches),
                        TrainingDataMonitoring([cost], prefix="this_step",
                                               after_batch=True),
                        TrainingDataMonitoring([cost], prefix="average",
                                               every_n_batches=100),
                        Checkpoint(save_path, every_n_batches=500),
                        Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1,
            iterate=True)).get_theano_function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
def main(mode, save_path, steps, num_batches, load_params):
    # Digits are included as their character form, so lookups on '0'..'9'
    # resolve (list(range(10)) would have produced integer keys).
    chars = (list(string.ascii_uppercase) + [str(d) for d in range(10)] +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.items()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))

    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1,
            iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_parameters().items()],
            width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(
            parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost, parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     # , Restrict(VariableClipping(1.0, axis=0),
                                     #            maxnorm_subjects)
                                     ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = (algorithm.gradients[param].norm(2) /
                         num_elements ** 0.5)
            step_norm = (algorithm.steps[param].norm(2) /
                         num_elements ** 0.5)
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)

        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average", every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid", every_n_batches=1500,
            before_training=False, data_stream=valid_stream)

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path),
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True).add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(
                          algorithm.total_step_norm)],
                      [average_monitoring.record_name(
                          algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)])
        main_loop.run()
    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
        words = [[char_to_ind[c] if c in char_to_ind
                  else char_to_ind['<UNK>'] for c in w]
                 for w in raw_words]
        max_word_length = max([len(w) for w in words])

        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(
            features, mask=features_mask, states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(
                    example == char_to_ind[" "])[0])
                state = (generator.transition.transition
                         .initial_states_.get_value()[None, :])
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i + 1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX),
                        state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(
                        numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                print("Average cost", total_word_cost / num_words)
                print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
def main_rnn(config):
    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

    # if 'LSTM' in config['model']:
    #     from models import getLSTMstack
    #     y_hat = getLSTMstack(input_dim=13, input_var=x,
    #                          depth=int(config['model'][-1]))
    # else:
    #     raise Exception("These are not the LSTM we are looking for")
    # y_hat = model.apply(x)

    emitter = TestEmitter()
    # emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])

    # cost_func = SquaredError()
    # @application
    # def qwe(self, readouts, outputs=None):
    #     print(type(self), type(readouts))
    #     x = cost_func.apply(readouts, outputs)
    #     return x
    print(type(emitter.cost))
    # emitter.cost = qwe

    steps = 2
    n_samples = config['target_size']

    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition, name="transition",
                                skip_connections=False)

    source_names = [name for name in transition.apply.states
                    if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'],
                      source_names=source_names, feedback_brick=None,
                      merge=None, merge_prototype=None, post_merge=None,
                      merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None,
                               add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    seqgen.transition.biases_init = IsotropicGaussian(0.01, 1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states', states)
    states = {name: shared_floatx_zeros((n_samples,
                                         config['lstm_hidden_size']))
              for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)

    # Cost
    # cost = SquaredError().apply(y_hat, y)
    # cost = CategoricalCrossEntropy().apply(T.flatten(), Y)

    # For sampling:
    # cg = ComputationGraph(seqgen.generate(n_steps=steps,
    #                                       batch_size=n_samples,
    #                                       iterate=True))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    # Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],
                                   config['source_size'],
                                   config['target_size'],
                                   config['num_examples'])

    # Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  # DataStreamMonitoring([cost, error_rate], test_stream,
                  #                      prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train",
                                         every_n_batches=1),
                  # Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]

    main_loop = MainLoop(algorithm, train_stream,
                         # model=model,
                         extensions=extensions)
    main_loop.run()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "save_path", default="sine",
        help="The path to save the PyLearn2 model")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Start training from scratch")
    args = parser.parse_args()

    num_states = ChainDataset.num_states

    if args.mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.debug("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.debug("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(args.save_path) and not args.reset:
            model = Pylearn2Model.load(args.save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: it would probably be nicer to make the cost part of the
        # model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=args.save_path, save_freq=10)
        train.main_loop()
    elif args.mode == "sample":
        model = Pylearn2Model.load(args.save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
def train():
    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512

        train_dataset = dataset.T_H5PYDataset(
            'dataset/wikifonia-seqlen-100.txt.hdf5',
            which_sets=('train',))
        alphabet_len = train_dataset.vocab_size()

        x = theano.tensor.lmatrix('inchar')

        recurrent_block = LSTM(dim=hidden_size, activation=Tanh())
        recurrent_block2 = LSTM(dim=hidden_size, activation=Tanh())
        recurrent_block3 = LSTM(dim=hidden_size, activation=Tanh())
        transition = RecurrentStack(
            [recurrent_block, recurrent_block2, recurrent_block3])

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len,
                                          hidden_size,
                                          name='feedback'),
            source_names=[thing for thing in transition.apply.states
                          if "states" in thing],
            emitter=RandomSoftmaxEmitter(),
            name='readout')

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=Uniform(width=0.02),
                                biases_init=Uniform(width=0.0001),
                                name='sequencegenerator')
        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        step_rules = [Adam(), StepClipping(1.0)]
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(step_rules),
                                    on_unused_sources='ignore')

        train_stream = DataStream.default_stream(
            train_dataset,
            iteration_scheme=SequentialScheme(train_dataset.num_examples,
                                              batch_size=20))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)])

    main.run()
name="emitter") source_names = [name for name in transition.apply.states if 'states' in name] readout = Readout(readout_dim=hidden_size_recurrent, source_names=source_names, emitter=emitter, feedback_brick=feedback, name="readout") generator = SequenceGenerator(readout=readout, transition=transition, name="generator") generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0.) generator.push_initialization_config() generator.transition.biases_init = IsotropicGaussian(0.01, 1) generator.transition.push_initialization_config() generator.initialize() states = {} states = generator.transition.apply.outputs states = { name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states } cost_matrix = generator.cost_matrix(x, **states)
    state_names=source_names,
    state_dims=[hidden_size_recurrent],
    attended_dim=context_size,
    name="attention")

readout = Readout(
    readout_dim=hidden_size_recurrent,
    source_names=source_names + ["feedback"] + ["glimpses"],
    emitter=emitter,
    feedback_brick=feedback,
    name="readout")

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()
generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# states = {}
states = [state for state in generator.transition.apply.outputs
          if state != "step"]
# ipdb.set_trace()
readout = Readout(readout_dim=alphabet_size,
                  source_names=["states#2"],
                  emitter=SoftmaxEmitter(name="emitter"),
                  feedback_brick=LookupFeedback(alphabet_size,
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")
seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()

# cost setup, from the markov_tutorial
x = tensor.lvector('features')
x = x.reshape((x.shape[0], 1))
cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1])
cost.name = "sequence_log_likelihood"

cost_cg = ComputationGraph(cost)
# theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png",
#                            var_with_name_simple=True)

algorithm = GradientDescent(
    cost=cost,
    parameters=list(Selector(seq_gen).get_parameters().values()))