def get_ngram_stream(ngram_order, which_set, which_partitions, vocabulary):
    """Return an iterator over n-grams.

    Notes
    -----
    This reads the text files sequentially. However, note that the files
    are already shuffled.

    """
    # Construct data stream
    logger.info('Constructing data stream')
    dataset = OneBillionWord(which_set, which_partitions, vocabulary)
    data_stream = dataset.get_example_stream()
    n_gram_stream = NGrams(ngram_order, data_stream)

    return n_gram_stream
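
# Usage sketch (not part of the original module). The training partition
# range below is an assumption; `vocabulary` is the word-to-index dictionary
# expected by OneBillionWord.
def _peek_at_ngrams(vocabulary, ngram_order=6):
    """Pull a single example to check that the n-gram pipeline is wired up."""
    stream = get_ngram_stream(ngram_order, 'training', range(1, 100),
                              vocabulary)
    return next(stream.get_epoch_iterator())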
def get_sentence_stream(which_set, which_partitions, vocabulary):
    """Return an iterator over sentences.

    Notes
    -----
    This reads the text files sequentially. However, note that the files
    are already shuffled.

    """
    # Construct data stream
    logger.info('Constructing data stream')
    dataset = OneBillionWord(which_set, which_partitions, vocabulary)
    data_stream = dataset.get_example_stream()

    # Get rid of long sentences that don't fit
    data_stream = Filter(data_stream, _filter_long)

    # Creates the dataset "targets"
    data_stream = Mapping(data_stream, _shift_words, add_sources=("targets",))

    return data_stream
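
# `_filter_long` and `_shift_words` are referenced above but not defined in
# this excerpt. The sketches below are plausible implementations consistent
# with how they are used; the 100-token cap and the one-step shift are
# assumptions, not taken from the original code.
def _filter_long(data):
    # Drop sentences that are too long for the model to handle.
    return len(data[0]) <= 100


def _shift_words(data):
    # Build the "targets" source for next-word prediction: the target
    # sequence is the input sentence shifted left by one token.
    sentence = data[0]
    return (sentence[1:],)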
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
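
# The helpers below are referenced in main() but not defined in this excerpt.
# These are plausible sketches consistent with how they are used; the exact
# word-reversal rules and the monitored record name are assumptions, not
# taken from the original code.
import math


def _lower(s):
    # TextFile/OneBillionWord preprocessing: lowercase every line.
    return s.lower()


def _transpose(data):
    # Padding produces batch-major arrays; the recurrent bricks expect
    # time-major input, so transpose every array in the batch.
    return tuple(array.T for array in data)


def _is_nan(log):
    # FinishAfter condition: stop training as soon as the gradient norm
    # recorded for the current batch is NaN.
    return math.isnan(log.current_row['total_gradient_norm'])


def reverse_words(sample):
    # Build the "targets" source: reverse the characters inside every word
    # while keeping the word order, spaces and boundary markers in place.
    sentence = sample[0]
    separators = {char2code[' '], char2code['<S>'], char2code['</S>']}
    result, word = [], []
    for code in sentence:
        if code in separators:
            result.extend(reversed(word))
            word = []
            result.append(code)
        else:
            word.append(code)
    result.extend(reversed(word))
    return (result,)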
def main(model_path, recurrent_type):
    dataset_options = dict(dictionary=char2code, level="character",
                           preprocess=_lower)
    dataset = OneBillionWord("training", [99], **dataset_options)
    data_stream = dataset.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')
    target_mask = tensor.matrix('target_mask')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))
    if recurrent_type == 'lstm':
        rnn = LSTM(dim // 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
    elif recurrent_type == 'simple':
        rnn = SimpleRecurrent(dim, Tanh())
        rnn = Bidirectional(rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
    else:
        raise ValueError('Unknown RNN type')
    rnn.initialize()
    lookup.initialize()

    y_hat = rnn.apply(lookup.apply(features), mask=features_mask)
    print(len(all_chars))

    linear = Linear(2 * dim, len(all_chars),
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    linear.initialize()
    y_hat = linear.apply(y_hat)
    seq_length = y_hat.shape[0]
    batch_size = y_hat.shape[1]
    y_hat = Softmax().apply(
        y_hat.reshape((seq_length * batch_size, -1))).reshape(y_hat.shape)

    cost = CategoricalCrossEntropy().apply(
        target.flatten(),
        y_hat.reshape((-1, len(all_chars)))) * seq_length * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)

    train_monitor = TrainingDataMonitoring(
        [cost, cost_per_character], prefix='train', after_batch=True)
    extensions = [train_monitor,
                  Printing(every_n_batches=40),
                  Dump(model_path, every_n_batches=200),
                  # Checkpoint('rnn.pkl', every_n_batches=200)
                  ]
    main_loop = MainLoop(model=model, algorithm=algorithm,
                         data_stream=data_stream, extensions=extensions)
    main_loop.run()
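
# `_make_target` (and the `all_chars` vocabulary) are referenced above but not
# defined in this excerpt. The sketch below is one plausible definition, an
# assumption rather than the original code: the target at each position is the
# next character, with a placeholder padding index appended so that features
# and target keep the same length. `all_chars` is assumed to be the character
# list backing `char2code`, e.g. sorted(char2code, key=char2code.get).
def _make_target(data):
    sentence = data[0]
    return (sentence[1:] + [0],)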