def build_evaluator(model_name):
    config = importlib.import_module('.%s' % model_name, 'config')
    config.batch_size = 1
    config.shuffle_questions = False

    # Build datastream
    valid_path = os.path.join(os.getcwd(), "squad_rare/dev-v1.0_tokenized.json")
    vocab_path = os.path.join(os.getcwd(), "squad_rare/vocab.txt")
    import data
    ds, valid_stream = data.setup_squad_datastream(valid_path, vocab_path, config)
    dump_path = os.path.join("model_params", model_name + ".pkl")

    # Build model
    m = config.Model(config, ds.vocab_size)
    model = Model(m.sgd_cost)
    if os.path.isfile(dump_path):
        with open(dump_path, 'r') as f:
            print "Analysing %s from best dump" % (model_name)
            model.set_parameter_values(cPickle.load(f))
    else:
        print "Analysing %s with random parameters" % (model_name)

    evaluator = DatasetEvaluator(m.analyse_vars)
    return evaluator, valid_stream, ds
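# A minimal, self-contained sketch (not part of build_evaluator above) of the
# pickle round trip it relies on: Model.set_parameter_values() accepts a plain
# name -> ndarray dict, so a pickled dict of arrays is all that is needed.
# The parameter names and file name below are made up for illustration.
import pickle
import numpy

fake_values = {
    '/output_layer.W': numpy.zeros((4, 3), dtype='float32'),
    '/output_layer.b': numpy.zeros(3, dtype='float32'),
}
with open('fake_params.pkl', 'wb') as f:
    pickle.dump(fake_values, f)
with open('fake_params.pkl', 'rb') as f:
    restored = pickle.load(f)
assert sorted(restored.keys()) == sorted(fake_values.keys())
# `restored` could now be passed to model.set_parameter_values(restored)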
def create_act_table(self, save_to, act_table):
    batch_size = 500
    image_size = (28, 28)
    output_size = 10
    convnet = create_lenet_5()
    layers = convnet.layers
    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])

    def full_brick_name(brick):
        return '/'.join([''] + [b.name for b in brick.get_unique_path()])

    # Find layer outputs to probe
    outmap = OrderedDict(
        (full_brick_name(get_brick(out)), out)
        for out in VariableFilter(
            roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables))
    # Generate pics for biases
    biases = VariableFilter(roles=[BIAS])(cg.parameters)
    # Generate parallel array, in the same order, for outputs
    outs = [outmap[full_brick_name(get_brick(b))] for b in biases]

    # Figure work count
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    max_activation_table = (MaxActivationTable().apply(outs)
                            .copy(name='max_activation_table'))
    max_activation_table.tag.aggregation_scheme = (
        Concatenate(max_activation_table))
    model = Model([error_rate, max_activation_table])

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    mnist_test_stream = DataStream.default_stream(
        self.mnist_test,
        iteration_scheme=SequentialScheme(
            self.mnist_test.num_examples, batch_size))

    evaluator = DatasetEvaluator([error_rate, max_activation_table])
    results = evaluator.evaluate(mnist_test_stream)
    table = results['max_activation_table']
    pickle.dump(table, open(act_table, 'wb'))
    return table
def test_model():
    x = tensor.matrix('x')
    mlp1 = MLP([Tanh(), Tanh()], [10, 20, 30], name="mlp1")
    mlp2 = MLP([Tanh()], [30, 40], name="mlp2")
    h1 = mlp1.apply(x)
    h2 = mlp2.apply(h1)
    model = Model(h2)
    assert model.get_top_bricks() == [mlp1, mlp2]
    # The order of parameters returned is deterministic but
    # not sensible.
    assert list(model.get_parameter_dict().items()) == [
        ('/mlp2/linear_0.b', mlp2.linear_transformations[0].b),
        ('/mlp1/linear_1.b', mlp1.linear_transformations[1].b),
        ('/mlp1/linear_0.b', mlp1.linear_transformations[0].b),
        ('/mlp1/linear_0.W', mlp1.linear_transformations[0].W),
        ('/mlp1/linear_1.W', mlp1.linear_transformations[1].W),
        ('/mlp2/linear_0.W', mlp2.linear_transformations[0].W)]

    # Test getting and setting parameter values
    mlp3 = MLP([Tanh()], [10, 10])
    mlp3.allocate()
    model3 = Model(mlp3.apply(x))
    parameter_values = {
        '/mlp/linear_0.W': 2 * numpy.ones((10, 10),
                                          dtype=theano.config.floatX),
        '/mlp/linear_0.b': 3 * numpy.ones(10, dtype=theano.config.floatX)}
    model3.set_parameter_values(parameter_values)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[0].get_value() == 2)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[1].get_value() == 3)
    got_parameter_values = model3.get_parameter_values()
    assert len(got_parameter_values) == len(parameter_values)
    for name, value in parameter_values.items():
        assert_allclose(value, got_parameter_values[name])

    # Test exception is raised if parameter shapes don't match
    def helper():
        parameter_values = {
            '/mlp/linear_0.W': 2 * numpy.ones((11, 11),
                                              dtype=theano.config.floatX),
            '/mlp/linear_0.b': 3 * numpy.ones(11, dtype=theano.config.floatX)}
        model3.set_parameter_values(parameter_values)
    assert_raises(ValueError, helper)

    # Test name conflict handling
    mlp4 = MLP([Tanh()], [10, 10])

    def helper():
        Model(mlp4.apply(mlp3.apply(x)))
    assert_raises(ValueError, helper)
def run_visualizations(cost, updates, train_stream, valid_stream, args,
                       hidden_states=None, gate_values=None):

    # Load the parameters from a dumped model
    assert args.load_path is not None
    model = Model(cost)
    model.set_parameter_values(load_parameter_values(args.load_path))

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost, hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        else:
            assert False
    elif args.visualize == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream, args)
    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream, args)
    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "presoft":
        visualize_presoft(cost, hidden_states, updates,
                          train_stream, valid_stream, args)
    elif args.visualize == "matrices":
        visualize_matrices(args)
    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)
    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates, args)
    else:
        assert False
def fine_tuning(cost, args):
    param_values = load_parameter_values(args.fine_tuning)
    param_values["/output_layer.W"] = np.concatenate((
        param_values["/output_layer.W"],
        0.1 * np.random.randn(args.state_dim, 40).astype(np.float32)))

    model = Model(cost)
    model.set_parameter_values(param_values)
    return cost
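# A small, self-contained sketch of the idea behind fine_tuning() above:
# extra randomly-initialised entries are concatenated onto a loaded output
# weight matrix so new outputs can be trained on top of pre-trained ones.
# The shapes here (state_dim=5, 3 old outputs, 40 new ones) are made up; the
# real orientation depends on how "/output_layer.W" is stored.
import numpy as np

state_dim = 5
old_W = np.zeros((3, state_dim), dtype=np.float32)            # pre-trained part
new_part = 0.1 * np.random.randn(40, state_dim).astype(np.float32)
extended_W = np.concatenate((old_W, new_part))                 # axis=0 by default
assert extended_W.shape == (43, state_dim)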
def testing(self, fea2obj):
    config = self._config
    dsdir = config['dsdir']
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    networkfile = config['net']
    batch_size = 10000  # int(config['batchsize'])
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info('#dev: %d #test: %d', len(devMentions), len(tstMentions))

    main_loop = load(networkfile + '.best.pkl')
    logger.info('Model loaded. Building prediction function...')
    old_model = main_loop.model
    logger.info(old_model.inputs)
    sources = [inp.name for inp in old_model.inputs]

    # fea2obj = build_input_objs(sources, config)
    t2idx = fea2obj['targets'].t2idx
    deterministic = str_to_bool(config['use_mean_pred']) \
        if 'use_mean_pred' in config else True
    kl_weight = shared_floatx(0.001, 'kl_weight')
    entropy_weight = shared_floatx(0.001, 'entropy_weight')

    cost, _, y_hat, _, _, _, _ = build_model_new(
        fea2obj, len(t2idx), self._config, kl_weight, entropy_weight,
        deterministic=deterministic, test=True)
    model = Model(cost)
    model.set_parameter_values(old_model.get_parameter_values())

    theinputs = []
    for fe in fea2obj.keys():
        if 'targets' in fe:
            continue
        for inp in model.inputs:
            if inp.name == fe:
                theinputs.append(inp)
    # theinputs = [inp for inp in model.inputs if inp.name != 'targets']
    print "theinputs: ", theinputs
    predict = theano.function(theinputs, y_hat)

    test_stream, num_samples_test = get_comb_stream(
        fea2obj, 'test', batch_size, shuffle=False)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size, shuffle=False)
    logger.info('sources: %s -- number of test/dev samples: %d/%d',
                test_stream.sources, num_samples_test, num_samples_dev)

    idx2type = {idx: t for t, idx in t2idx.iteritems()}

    logger.info('Starting to apply on dev inputs...')
    self.applypredict(theinputs, predict, dev_stream, devMentions,
                      num_samples_dev, batch_size,
                      os.path.join(config['exp_dir'], config['matrixdev']),
                      idx2type)
    logger.info('...apply on dev data finished')

    logger.info('Starting to apply on test inputs...')
    self.applypredict(theinputs, predict, test_stream, tstMentions,
                      num_samples_test, batch_size,
                      os.path.join(config['exp_dir'], config['matrixtest']),
                      idx2type)
    logger.info('...apply on test data finished')
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None, test_set=None,
         valid_examples=None, dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
            len(feature_maps) != len(pool_sizes) or
            len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm:  # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
        cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
                  .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        with open(load_params, 'r') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",),
                       subset=slice(None, 25000 - valid_examples, None))
    train_str = DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",),
                       subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        directory, _ = os.path.split(sys.argv[0])
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [directory + '/server.py',
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        train_str = ServerDataStream(
            ('image_features', 'targets'), produces_examples=False,
            port=port)

        save_to_base, save_to_extension = os.path.splitext(save_to)

        # Train with simple SGD
        if algo == 'rmsprop':
            step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
        elif algo == 'adam':
            step_rule = Adam()
        else:
            assert False
        if max_norm:
            conv_params = VariableFilter(bricks=[Convolutional],
                                         roles=[WEIGHT])(cg)
            linear_params = VariableFilter(bricks=[Linear],
                                           roles=[WEIGHT])(cg)
            step_rule = CompositeRule(
                [step_rule,
                 Restrict(VariableClipping(max_norm, axis=0), linear_params),
                 Restrict(VariableClipping(max_norm, axis=(1, 2, 3)),
                          conv_params)])

        algorithm = GradientDescent(
            cost=cost, parameters=model.parameters, step_rule=step_rule)
        algorithm.add_updates(extra_updates)

        # `Timing` extension reports time for reading data, aggregating a
        # batch and monitoring;
        # `ProgressBar` displays a nice progress bar during training.
        extensions = [Timing(every_n_batches=100),
                      FinishAfter(after_n_epochs=num_epochs,
                                  after_n_batches=num_batches),
                      DataStreamMonitoring(
                          [valid_cost, valid_error_rate],
                          valid_str,
                          prefix="valid"),
                      TrainingDataMonitoring(
                          [cost, error_rate,
                           aggregation.mean(algorithm.total_gradient_norm)],
                          prefix="train",
                          after_epoch=True),
                      TrackTheBest("valid_error_rate"),
                      Checkpoint(save_to, save_separately=['log'],
                                 parameters=cg.parameters +
                                 (population_statistics if batch_norm else []),
                                 before_training=True, after_epoch=True)
                      .add_condition(
                          ['after_epoch'],
                          OnLogRecord("valid_error_rate_best_so_far"),
                          (save_to_base + '_best' + save_to_extension,)),
                      Printing(every_n_batches=100)]

        model = Model(cost)
        main_loop = MainLoop(
            algorithm,
            train_str,
            model=model,
            extensions=extensions)
        try:
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                prediction = classify(image)
                print(index + 1, classify(image), sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        assert False
for p in cg.parameters:
    print(str(p), p.shape, p.dtype)
print("Created ComputationGraph, inputs:"); print(cg.inputs)

# Strangely, all the examples use : DataStreamMonitoring in MainLoop
model = Model(labels)
print("Model.dict_of_inputs():"); print(model.dict_of_inputs())
print("Model list inputs:"); print([v.name for v in model.inputs])

## Model loading from saved file
model.set_parameter_values(load_parameter_values(save_state_path))

examine_embedding(lookup.W.get_value())

label_ner = model.get_theano_function()
print(model.inputs)
print("printed label_ner.params")

for test_data in data_stream.get_epoch_iterator():
    ordered_batch = test_data[0:3]  # Explicitly strip off the pre-defined labels
    #print(ordered_batch)

    results = label_ner(*ordered_batch)
    #print(results)  # This is a pure array of labels

    inputs = _transpose(ordered_batch)
def __init__(self, save_to):
    batch_size = 500
    image_size = (28, 28)
    output_size = 10
    convnet = create_lenet_5()
    layers = convnet.layers

    mnist_test = MNIST(("test",), sources=['features', 'targets'])

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])

    def full_brick_name(brick):
        return '/'.join([''] + [b.name for b in brick.get_unique_path()])

    # Find layer outputs to probe
    outmap = OrderedDict(
        (full_brick_name(get_brick(out)), out)
        for out in VariableFilter(
            roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables))
    # Generate pics for biases
    biases = VariableFilter(roles=[BIAS])(cg.parameters)
    # Generate parallel array, in the same order, for outputs
    outs = [outmap[full_brick_name(get_brick(b))] for b in biases]

    # Figure work count
    error_rate = (MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate'))
    sensitive_unit_count = (SensitiveUnitCount().apply(
        y.flatten(), probs, biases).copy(name='sensitive_unit_count'))
    sensitive_unit_count.tag.aggregation_scheme = (
        Concatenate(sensitive_unit_count))
    active_unit_count = (ActiveUnitCount().apply(outs).copy(
        name='active_unit_count'))
    active_unit_count.tag.aggregation_scheme = (
        Concatenate(active_unit_count))
    ignored_unit_count = (IgnoredUnitCount().apply(
        y.flatten(), probs, biases, outs).copy(name='ignored_unit_count'))
    ignored_unit_count.tag.aggregation_scheme = (
        Concatenate(ignored_unit_count))

    model = Model([error_rate, sensitive_unit_count,
                   active_unit_count, ignored_unit_count])

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples,
                                          batch_size))

    evaluator = DatasetEvaluator([error_rate, sensitive_unit_count,
                                  active_unit_count, ignored_unit_count])
    results = evaluator.evaluate(mnist_test_stream)

    def save_ranked_image(scores, filename):
        sorted_instances = scores.argsort()
        filmstrip = Filmstrip(image_shape=(28, 28), grid_shape=(100, 100))
        for i, index in enumerate(sorted_instances):
            filmstrip.set_image((i // 100, i % 100),
                                mnist_test.get_data(request=index)[0])
        filmstrip.save(filename)

    save_ranked_image(results['sensitive_unit_count'], 'sensitive.jpg')
    save_ranked_image(results['active_unit_count'], 'active.jpg')
    save_ranked_image(results['ignored_unit_count'], 'ignored.jpg')
#get the test stream
from fuel.datasets.dogs_vs_cats import DogsVsCats
from fuel.streams import DataStream, ServerDataStream
from fuel.schemes import ShuffledScheme, SequentialExampleScheme
from fuel.transformers.image import (RandomFixedSizeCrop,
                                     MinimumImageDimensions,
                                     MaximumImageDimensions,
                                     Random2DRotation)
from fuel.transformers import Flatten, Cast, ScaleAndShift

size = (128, 128)
cats = DogsVsCats(('test',))
stream = DataStream.default_stream(
    cats, iteration_scheme=SequentialExampleScheme(cats.num_examples))
stream_upscale = MaximumImageDimensions(stream, size,
                                        which_sources=('image_features',))
stream_scale = ScaleAndShift(stream_upscale, 1. / 255, 0,
                             which_sources=('image_features',))
stream_data = Cast(stream_scale, dtype='float32',
                   which_sources=('image_features',))

#Load the parameters of the model
params = load_parameter_values('convnet_parameters.pkl')
mo = Model(predict)
mo.set_parameter_values(params)

#Create the forward propagation function
fprop = function(mo.inputs, mo.outputs[0], allow_input_downcast=True)

tab = []
i = 1

#Get the prediction for each example of the test set
for data in stream_data.get_epoch_iterator():
    predict = np.argmax(fprop(data))
    tab.append([i, predict])
    print str(i) + "," + str(predict)
    i = i + 1

#Save predictions in a csv file
np.savetxt("dump.csv", tab, delimiter=",", fmt='%d')
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(named_copy(
                parameter.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[parameter].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
from blocks.roles import WEIGHT, PARAMETER, add_role
from blocks.serialization import load

weights = VariableFilter(roles=[WEIGHT])(cg.variables)
apply_noise_weights = True
#if apply_noise_weights:
    #cg = apply_noise(cg,weights, 0.01)

# print 453.922119141
# #print function([f0, sp, voiced, start_flag], cg.outputs[0])(x_tr[0],x_tr[1],x_tr[2],x_tr[3])
# print function([f0, sp, voiced, start_flag], cg.outputs[0])(x_tr[0],x_tr[1],x_tr[2],x_tr[3])

loaded_main_loop = load(save_dir + "pkl/best_" + load_from + ".pkl")
loaded_model = loaded_main_loop.model
loaded_params = loaded_model.get_parameter_values()
model.set_parameter_values(loaded_params)

del loaded_model
del loaded_main_loop
del loaded_params

variances = [weight.get_value().std() for weight in weights]
prior_variances = [variance for variance in variances]

from blocks.utils import shared_floatx
variances = [shared_floatx(variance, 'std') for variance in variances]

for variance in variances:
    add_role(variance, PARAMETER)
def generate_embeddings(config, tar_path, part, dest_path, format_,
                        average=False, encoder_embeddings=None, **kwargs):
    """Generate embeddings for all the definitions, average them and
    serialize, OR, if encoder_embeddings, serialize the model's encoder
    embeddings.

    config: name of the config of the model
    tar_path: tar path of the model parameters
    part: part of the dataset (should be either 'train', 'valid', 'test'
        or 'all')
    dest_path: directory where the serialized embeddings will be written
    format: either 'dict' or 'glove'
    encoder_embeddings: None, 'only', 'mixed', 'if_missing'
        - None: don't include encoder embeddings
        - 'only': don't read any data, just serialize the encoder embeddings
        - 'mixed': add the encoder embeddings to the list of definition
          embeddings
        - 'if_missing': add the encoder embeddings when there is no
          corresponding def
    average: if true, multi-prototype embeddings will be averaged
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)
    c = config
    data, model = initialize_data_and_model(c, train_phase=False)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')
    keys = T.lmatrix('keys')
    n_identical_keys = T.lvector('n_identical_keys')
    sym_args = [words, words_mask]

    if format_ not in ['dict', 'glove']:
        raise ValueError("format should be either: dict, glove")

    if not c['encoder'] and encoder_embeddings != 'only':
        raise ValueError('Error: this model does not have an encoder.')

    if use_keys(c):
        sym_args.append(keys)
    if use_n_identical_keys(c):
        sym_args.append(n_identical_keys)

    costs = model.apply(*sym_args, train_phase=False)

    cg = Model(costs)
    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    if encoder_embeddings:
        if encoder_embeddings == 'only' and not c['encoder']:
            embeddings_array = model.get_def_embeddings_params('key').eval()
        else:
            embeddings_array = model.get_def_embeddings_params('main').eval()
        entries = model.get_embeddings_entries()
        enc_embeddings = {e: np.asarray(a)
                          for e, a in zip(entries, embeddings_array)}
        if encoder_embeddings == 'only':
            serialize_embeddings(enc_embeddings, format_, dest_path,
                                 "encoder_embeddings")
            return 0

    embeddings_var, = VariableFilter(name='embeddings')(cg)
    compute = dict({"embeddings": embeddings_var})
    if c['proximity_coef'] != 0:
        prox_var, = VariableFilter(name='proximity_term')(cg)
        compute["proximity_term"] = prox_var
    print "sym args", sym_args
    predict_f = theano.function(sym_args, compute)
    batch_size = 256  # size of test_unseen
    stream = data.get_stream(part, batch_size=batch_size,
                             max_length=c['max_length'],
                             remove_keys=False,
                             remove_n_identical_keys=False)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    vocab = model._vocab
    print "start computing"
    embeddings = defaultdict(list)
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i % 10 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        keys = input_data['keys']
        n_identical_keys = input_data['n_identical_keys']
        args = [words, words_mask]
        if use_keys(c):
            args.append(keys)
        if use_n_identical_keys(c):
            args.append(n_identical_keys)

        to_save = predict_f(*args)
        for k, h in zip(keys, to_save['embeddings']):
            key = vec2str(k)
            if encoder_embeddings == 'if_missing':
                try:
                    del enc_embeddings[key]
                except KeyError:
                    pass
            embeddings[key].append(h)
        i += 1

    if encoder_embeddings in ['mixed', 'if_missing']:
        for k, e in enc_embeddings.iteritems():
            embeddings[k].append(e)

    if encoder_embeddings == 'mixed':
        prefix_fname = 'mix_e_'
    elif encoder_embeddings == 'if_missing':
        prefix_fname = 'if_mis_e_'
    else:
        prefix_fname = ''

    # combine:
    if average:
        mean_embeddings = {}
        for k in embeddings.keys():
            mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0)
        serialize_embeddings(mean_embeddings, format_, dest_path,
                             prefix_fname + "mean_embeddings")
    else:
        serialize_embeddings(embeddings, format_, dest_path,
                             prefix_fname + "embeddings")
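# A minimal, self-contained illustration (with made-up vectors) of the
# `average=True` branch above: the multiple definition embeddings collected
# per key are reduced to a single mean vector.
import numpy as np
from collections import defaultdict

embeddings = defaultdict(list)
embeddings['bank'].extend([np.array([1.0, 0.0]), np.array([0.0, 1.0])])
embeddings['tree'].append(np.array([2.0, 2.0]))

mean_embeddings = {k: np.mean(np.asarray(v), axis=0)
                   for k, v in embeddings.items()}
assert np.allclose(mean_embeddings['bank'], [0.5, 0.5])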
def main(save_to):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    layers = convnet.layers
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis = create_fair_basis(mnist_test, 10, 10)

    # state = mnist_test.open()
    #
    # basis = numpy.zeros((100, 1, 28, 28), dtype=theano.config.floatX)
    # counters = [0] * 10
    # index = 0
    # while min(counters) < 10:
    #     feature, target = mnist_test.get_data(state=state, request=[index])
    #     target = target[0, 0]
    #     feature = feature / 256
    #     if counters[target] < 10:
    #         basis[target + counters[target] * 10, :, :, :] = \
    #             feature[0, :, :, :]
    #         counters[target] += 1
    #     index += 1
    # mnist_test.close(state=state)

    # b = shared_floatx(basis)
    # random_init = numpy.rand.random(100, 1000)
    # r = shared_floatx(random_init)
    # rn = r / r.norm(axis=1)
    # x = tensor.dot(rn, tensor.shape_padright(b))
    x = tensor.tensor4('features')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])
    outs = VariableFilter(
        roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)

    # Create an interior activation model
    model = Model([probs] + outs)

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    fn = theano.function([x], outs)
    results = fn(basis)
    for snapshots, output in zip(results, outs):
        layer = get_brick(output)
        filmstrip = Filmstrip(
            basis.shape[-2:], (snapshots.shape[1], snapshots.shape[0]),
            background='purple')

        if layer in layers:
            fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1])
            for unit in range(snapshots.shape[1]):
                for index in range(snapshots.shape[0]):
                    mask = make_mask(basis.shape[-2:], fieldmap, numpy.clip(
                        snapshots[index, unit, :, :], 0, numpy.inf))
                    filmstrip.set_image((unit, index),
                                        basis[index, :, :, :], mask)
            filmstrip.save(layer.name + '_show.jpg')
print "time for building the model: " + str(time2 - time1) ######### load model and get results ################## n_epochs = "15" if "n_epochs" in config: n_epochs = config["n_epochs"] finalNetworkfile = networkfile + "." + n_epochs model = Model([cost]) f = open(finalNetworkfile) old_main_loop = load(f) f.close() old_model = old_main_loop.model model.set_parameter_values(old_model.get_parameter_values()) extensions = [] algorithm = None extensions.append( GetPRcurve(layer3=layer3, y=y, model=model, data_stream=data_stream_test, num_samples=len(bag_size_list_test), batch_size=bag_size_list_test, before_training=True)) my_loop = MainLoop(model=model, data_stream=data_stream, algorithm=algorithm, extensions=extensions)
def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):
    import os
    from collections import OrderedDict
    from fuel.datasets import IndexableDataset
    from blocks.model import Model
    from blocks.bricks import Linear, Softmax
    from blocks.bricks.conv import MaxPooling
    from blocks.initialization import Uniform
    from deepthought.bricks.cost import HingeLoss
    import numpy as np
    import theano
    from theano import tensor

    assert model_prefix is not None
    fold_weights_filename = '{}_weights.npy'.format(model_prefix)

    # convert Y to one-hot encoding
    n_classes = len(set(Y))
    Y = np.eye(n_classes, dtype=int)[Y]

    features = tensor.matrix('features', dtype=theano.config.floatX)
    targets = tensor.lmatrix('targets')

    input_ = features
    dim = X.shape[-1]

    # optional additional layers
    if self.pipeline_factory is not None:
        # need to re-shape flattened input to restore bc01 format
        # (tuple, uses actual batch size)
        input_shape = (input_.shape[0],) + \
            hyper_params['classifier_input_shape']
        input_ = input_.reshape(input_shape)

        pipeline = self.pipeline_factory.build_pipeline(input_shape,
                                                        hyper_params)
        input_ = pipeline.apply(input_)
        input_ = input_.flatten(ndim=2)

        # this is very hacky, but there seems to be no elegant way to
        # obtain a value for dim
        dummy_fn = theano.function(inputs=[features], outputs=input_)
        dummy_out = dummy_fn(X[:1])
        dim = dummy_out.shape[-1]

    if hyper_params['classifier_pool_width'] > 1:
        # FIXME: this is probably broken!
        # c = hyper_params['num_components']
        # input_ = input_.reshape((input_.shape[0], c,
        #                          input_.shape[-1] // c, 1))  # restore bc01

        # need to re-shape flattened input to restore bc01 format
        input_shape = hyper_params['classifier_pool_input_shape']  # tuple
        input_ = input_.reshape(input_shape)

        pool = MaxPooling(
            name='pool',
            input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1),
            pooling_size=(hyper_params['classifier_pool_width'], 1),
            step=(hyper_params['classifier_pool_stride'], 1))
        input_ = pool.apply(input_)
        input_ = input_.reshape((input_.shape[0],
                                 tensor.prod(input_.shape[1:])))
        dim = np.prod(pool.get_dim('output'))

    linear = Linear(name='linear',
                    input_dim=dim,
                    output_dim=n_classes,
                    weights_init=Uniform(mean=0, std=0.01),
                    use_bias=False)
    linear.initialize()
    softmax = Softmax('softmax')

    probs = softmax.apply(linear.apply(input_))
    prediction = tensor.argmax(probs, axis=1)

    model = Model(probs)  # classifier with raw probability outputs
    predict = theano.function([features], prediction)  # ready-to-use predict function

    if os.path.isfile(fold_weights_filename):
        # load filter weights from existing file
        fold_weights = np.load(fold_weights_filename)
        print 'loaded filter weights from', fold_weights_filename
    else:
        # train model
        from blocks.bricks.cost import MisclassificationRate
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.bricks import Softmax
        from blocks.model import Model
        from blocks.algorithms import GradientDescent, Adam
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import (DataStreamMonitoring,
                                                  TrainingDataMonitoring)
        from blocks.extensions.predicates import OnLogRecord
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams
        # from deepthought.datasets.selection import DatasetMetaDB

        init_param_values = model.get_parameter_values()

        cost = HingeLoss().apply(targets, probs)
        # Note: this requires just the class labels, not a one-hot encoding
        error_rate = MisclassificationRate().apply(targets.argmax(axis=1),
                                                   probs)
        error_rate.name = 'error_rate'

        cg = ComputationGraph([cost])

        # L1 regularization
        if hyper_params['classifier_l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['classifier_l1wdecay'] * sum(
                [abs(W).sum() for W in weights])

        cost.name = 'cost'

        # iterate over trial folds
        fold_weights = []
        fold_errors = []
        # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
        #
        #     train_selectors = fold_generator.get_fold_selectors(
        #         outer_fold=outer_fold, inner_fold=ifold['train'])
        #     valid_selectors = fold_generator.get_fold_selectors(
        #         outer_fold=outer_fold, inner_fold=ifold['valid'])
        #
        #     metadb = DatasetMetaDB(meta, train_selectors.keys())
        #
        #     # get selected trial IDs
        #     train_idx = metadb.select(train_selectors)
        #     valid_idx = metadb.select(valid_selectors)
        for train_idx, valid_idx in idx_folds:
            # print train_idx
            # print valid_idx

            trainset = IndexableDataset(indexables=OrderedDict(
                [('features', X[train_idx]), ('targets', Y[train_idx])]))

            validset = IndexableDataset(indexables=OrderedDict(
                [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

            model.set_parameter_values(init_param_values)

            best_params = BestParams()
            best_params.add_condition(
                ['after_epoch'],
                predicate=OnLogRecord('error_rate_valid_best_so_far'))

            algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                        step_rule=Adam())

            extensions = [
                Timing(),
                FinishAfter(
                    after_n_epochs=hyper_params['classifier_max_epochs']),
                DataStreamMonitoring(
                    [cost, error_rate],
                    DataStream.default_stream(
                        validset,
                        iteration_scheme=SequentialScheme(
                            validset.num_examples,
                            hyper_params['classifier_batch_size'])),
                    suffix="valid"),
                TrainingDataMonitoring(
                    [cost, error_rate,
                     aggregation.mean(algorithm.total_gradient_norm)],
                    suffix="train",
                    after_epoch=True),
                TrackTheBest('error_rate_valid'),
                best_params,  # after TrackTheBest!
            ]

            if verbose:
                extensions.append(Printing())  # optional
                extensions.append(ProgressBar())

            main_loop = MainLoop(
                algorithm,
                DataStream.default_stream(
                    trainset,
                    iteration_scheme=ShuffledScheme(
                        trainset.num_examples,
                        hyper_params['classifier_batch_size'])),
                model=model,
                extensions=extensions)

            main_loop.run()

            fold_weights.append(best_params.values['/linear.W'])
            fold_errors.append(main_loop.status['best_error_rate_valid'])

            # break  # FIXME

        fold_errors = np.asarray(fold_errors).squeeze()
        print 'simple NN fold classification errors:', fold_errors

        fold_weights = np.asarray(fold_weights)

        # store filter weights for later analysis
        np.save(fold_weights_filename, fold_weights)

    weights = fold_weights.mean(axis=0)
    linear.parameters[0].set_value(weights)

    return model, predict
def main(save_to):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    layers = [l for l in convnet.layers if isinstance(l, Convolutional)]
    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 50)
    basis_set = make_shifted_basis(basis_init, convnet, layers)

    for layer, basis in zip(layers, basis_set):
        # basis is 5d:
        # (probed_units, base_cases, 1-c, 28-y, 28-x)
        b = shared_floatx(basis)
        # coefficients is 2d:
        # (probed_units, base_cases)
        coefficients = shared_floatx(
            numpy.ones(basis.shape[0:2], dtype=theano.config.floatX))
        # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x)
        prod = tensor.shape_padright(coefficients, 3) * b
        # x is 4d: (probed_units, 1-c, 28-y, 28-x)
        ux = prod.sum(axis=1)
        x = tensor.clip(
            ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3),
            0, 1)

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])
        outs = VariableFilter(
            roles=[OUTPUT], bricks=[layer])(cg.variables)

        # Create an interior activation model
        model = Model([probs] + outs)

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        learning_rate = shared_floatx(0.03, 'learning_rate')
        # We will try to do all units at once.
        # unit = shared_floatx(0, 'unit', dtype='int64')
        # But we are only doing one layer at once.
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims, )
            unitrange = tensor.arange(dims[0])
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            costvec = -tensor.log(tensor.nnet.softmax(
                output[unitrange, unitrange,
                       dims[1] // 2, dims[2] // 2]).flatten())
        cost = costvec.sum()
        # grad is dims (probed_units, basis_size)
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])
        filmstrip = Filmstrip(
            random_init.shape[-2:], (dims[0], 1),
            background='red')
        layer = get_brick(output)

        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                filmstrip.save(layer.name + '_stroke.jpg')
        for u in range(dims[0]):
            filmstrip.set_image((u, 0), result[u, :, :, :])
        filmstrip.save(layer.name + '_stroke.jpg')
#Load the file
test = load('catsVsDogs256_8_v3.pkl')
m = test.model

#Load the test image
pic = Image.open("image.jpg").resize((256, 256))
pix = np.array(pic.getdata()) / 255.0
pix = pix.reshape((pic.size[0], pic.size[1], 3))
pix = pix.reshape((1, 3, pic.size[0], pic.size[1]))

#For each layer, save the output as image file
for k in range(6):
    print "Creation model " + str(k)
    y1 = ConvolutionalSequence(conv_layers[0:(k + 1) * 3], num_channels,
                               image_size=image_shape,
                               use_bias=False).apply(x)
    mo = Model(y1)
    test = mo.set_parameter_values(m.get_parameter_values())
    fprop = function(mo.inputs, mo.outputs[0], allow_input_downcast=True)
    arr = fprop(pix)
    arr = arr.reshape((arr.shape[0] * arr.shape[1],
                       arr.shape[2], arr.shape[3]))

    #Normalize to get an image
    for i in range(arr.shape[0]):
        wmin = float(arr[i].min())
        wmax = float(arr[i].max())
        if wmin and wmax:
            arr[i] *= (255.0 / float(wmax - wmin))
            arr[i] += abs(wmin) * (255.0 / float(wmax - wmin))

    #Plot the outputs
    fig, ax = plt.subplots(nrows=arr.shape[0] / 10, ncols=10,
                           sharex=True, sharey=False)
    for i in xrange(arr.shape[0]):
        ax[i / 10][i % 10].imshow(arr[i], cmap='Greys_r')
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size

    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout],
        name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls
        # its output output_0
        applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply],
        name="attended")(cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply],
        name="attended_mask")(cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(
        roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(
        roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0],
                                 "max_attended_length")
    max_num_phonemes = rename(labels.shape[0],
                              "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(),
                           "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask),
                             "weights_entropy")
    mask_density = rename(labels_mask.mean(),
                          "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = \
            apply_adaptive_noise(
                cg, cg.outputs[0],
                variables=cg.parameters,
                num_examples=data.get_dataset('train').num_examples,
                parameters=Model(
                    regularized_cg.outputs[0]).get_parameter_dict().values(),
                **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],   # model cost
            regularized_cg.outputs[2],   # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # the shared variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape)
                     for key in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True,
                               load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables + [l2_cost, cost_forward], after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables +
                                   [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data,
                           **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs'))
        .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
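# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): how a CompositeRule chains step
# rules for GradientDescent, as in the algorithm assembled above (clipping,
# then a core rule, then NaN protection). The toy cost and shapes are
# hypothetical; only standard Blocks/Theano APIs are assumed.
import numpy
import theano
from theano import tensor
from blocks.algorithms import (GradientDescent, CompositeRule, StepClipping,
                               Momentum, RemoveNotFinite)

toy_W = theano.shared(
    numpy.zeros((3, 3), dtype=theano.config.floatX), name='toy_W')
toy_x = tensor.matrix('toy_x')
toy_cost = (tensor.dot(toy_x, toy_W) ** 2).sum()
toy_algorithm = GradientDescent(
    cost=toy_cost, parameters=[toy_W],
    step_rule=CompositeRule([
        StepClipping(threshold=100.0),               # rescale overly large steps
        Momentum(learning_rate=0.01, momentum=0.9),
        RemoveNotFinite(0.0),                        # NaN/Inf steps become zero, leaving parameters unchanged
    ]))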
def main(save_to, hist_file): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test", ), sources=['features', 'targets']) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) model = Model([error_rate, confusion]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict( (full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(model.variables)) # Load histogram information with open(hist_file, 'rb') as handle: histograms = pickle.load(handle) # Corpora mnist_train = MNIST(("train", )) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size)) # Probe the given layer target_layer = '/lenet/mlp/linear_0' next_layer_param = '/lenet/mlp/linear_1.W' sample = extract_sample(outs[target_layer], mnist_test_stream) print('sample shape', sample.shape) # Figure neurons to ablate hist = histograms[('linear_1', 'b')] targets = [i for i in range(hist.shape[1]) if hist[2, i] * hist[7, i] < 0] print('ablating', len(targets), ':', targets) # Now adjust the next layer weights based on the probe param = model.get_parameter_dict()[next_layer_param] print('param shape', param.get_value().shape) new_weights = ablate_inputs(targets, sample, param.get_value(), compensate=False) param.set_value(new_weights) # Evaluation pass evaluator = DatasetEvaluator([error_rate, confusion]) 
print(evaluator.evaluate(mnist_test_stream))
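# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the probe-and-edit pattern used
# above, i.e. fetch a parameter by its hierarchical name from
# Model.get_parameter_dict() and overwrite part of it in place. The Linear
# brick, its name and the ablated columns are hypothetical.
from theano import tensor
from blocks.bricks import Linear
from blocks.initialization import Constant
from blocks.model import Model

linear = Linear(name='linear_0', input_dim=4, output_dim=3,
                weights_init=Constant(1), biases_init=Constant(0))
x = tensor.matrix('x')
y = linear.apply(x)
linear.initialize()
model = Model(y)

param = model.get_parameter_dict()['/linear_0.W']
weights = param.get_value()
weights[:, [0, 2]] = 0.          # "ablate" two hypothetical output units
param.set_value(weights)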
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout, depth, max_grad, step_method, epsilon, sample, skip, uniform, top): #---------------------------------------------------------------------- datasource = name def shnum(x): """ Convert a positive float into a short tag-usable string E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2 """ return '0' if x <= 0 else '%s%d' % (("%e"%x)[0], -np.floor(np.log10(x))) jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim, int(dropout*10), shnum(learning_rate), batch_size, shnum(epsilon)) if max_length != 600: jobname += '-L%d'%max_length if GRU: jobname += 'g' if max_grad != 5.: jobname += 'G%g'%max_grad if step_method != 'adam': jobname += step_method if skip: jobname += 'D' assert depth > 1 if top: jobname += 'T' assert depth > 1 if uniform > 0.: jobname += 'u%d'%int(uniform*100) if debug: jobname += ".debug" if sample: print("Sampling") else: print("\nRunning experiment %s" % jobname) if old_model_name: print("starting from model %s"%old_model_name) #---------------------------------------------------------------------- transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim) for _ in range(depth)] if depth > 1: transition = RecurrentStack(transitions, name="transition", skip_connections=skip or top) if skip: source_names=[RecurrentStack.suffix('states', d) for d in range(depth)] else: source_names=[RecurrentStack.suffix('states', depth-1)] else: transition = transitions[0] transition.name = "transition" source_names=['states'] emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter") readout = Readout( readout_dim=emitter.get_dim('inputs'), source_names=source_names, emitter=emitter, name="readout") generator = SequenceGenerator(readout=readout, transition=transition) # Initialization settings if uniform > 0.: generator.weights_init = Uniform(width=uniform*2.) else: generator.weights_init = OrthogonalGlorot() generator.biases_init = Constant(0) # Build the cost computation graph [steps, batch_size, 3] x = T.tensor3('features', dtype=floatX) if debug: x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX) x = x[:max_length,:,:] # has to be after setting test_value cost = generator.cost(x) cost.name = "sequence_log_likelihood" # Give an idea of what's going on model = Model(cost) params = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in params.items()], width=120)) model_size = 0 for v in params.itervalues(): s = v.get_value().shape model_size += s[0] * (s[1] if len(s) > 1 else 1) logger.info("Total number of parameters %d"%model_size) #------------------------------------------------------------ extensions = [] if old_model_name: if old_model_name == 'continue': old_model_name = jobname with open(old_model_name + '_model', "rb") as f: old_model = pickle.load(f) model.set_parameter_values(old_model.get_parameter_values()) del old_model else: # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() if sample: assert old_model_name and old_model_name != 'continue' Sample(generator, steps=max_length, path=old_model_name).do(None) exit(0) #------------------------------------------------------------ # Define the training algorithm. 
cg = ComputationGraph(cost) if dropout > 0.: from blocks.roles import INPUT, OUTPUT dropout_target = VariableFilter(roles=[OUTPUT], bricks=transitions, name_regex='states')(cg.variables) print('# dropout %d' % len(dropout_target)) cg = apply_dropout(cg, dropout_target, dropout) opt_cost = cg.outputs[0] else: opt_cost = cost if step_method == 'adam': step_rule = Adam(learning_rate) elif step_method == 'rmsprop': step_rule = RMSProp(learning_rate, decay_rate=0.95) elif step_method == 'adagrad': step_rule = AdaGrad(learning_rate) elif step_method == 'adadelta': step_rule = AdaDelta() elif step_method == 'scale': step_rule = Scale(learning_rate) else: raise Exception('Unknown step method %s'%step_method) step_rule = CompositeRule([StepClipping(max_grad), step_rule]) algorithm = GradientDescent( cost=opt_cost, parameters=cg.parameters, step_rule=step_rule) #------------------------------------------------------------ observables = [cost] # Fetch variables useful for debugging (energies,) = VariableFilter( applications=[generator.readout.readout], name_regex="output")(cg.variables) min_energy = energies.min().copy(name="min_energy") max_energy = energies.max().copy(name="max_energy") observables += [min_energy, max_energy] # (activations,) = VariableFilter( # applications=[generator.transition.apply], # name=generator.transition.apply.states[0])(cg.variables) # mean_activation = named_copy(abs(activations).mean(), # "mean_activation") # observables.append(mean_activation) observables += [algorithm.total_step_norm, algorithm.total_gradient_norm] for name, param in params.items(): observables.append(param.norm(2).copy( name=name + "_norm")) observables.append(algorithm.gradients[param].norm(2).copy( name=name + "_grad_norm")) #------------------------------------------------------------ datasource_fname = os.path.join(fuel.config.data_path[0], datasource, datasource+'.hdf5') train_ds = H5PYDataset(datasource_fname, #max_length=max_length, which_sets=['train'], sources=('features',), load_in_memory=True) train_stream = DataStream(train_ds, iteration_scheme=ShuffledScheme( train_ds.num_examples, batch_size)) test_ds = H5PYDataset(datasource_fname, #max_length=max_length, which_sets=['test'], sources=('features',), load_in_memory=True) test_stream = DataStream(test_ds, iteration_scheme=SequentialScheme( test_ds.num_examples, batch_size)) train_stream = Mapping(train_stream, _transpose) test_stream = Mapping(test_stream, _transpose) def stream_stats(ds, label): itr = ds.get_epoch_iterator(as_dict=True) batch_count = 0 examples_count = 0 for batch in itr: batch_count += 1 examples_count += batch['features'].shape[1] print('%s #batch %d #examples %d' % (label, batch_count, examples_count)) stream_stats(train_stream, 'train') stream_stats(test_stream, 'test') extensions += [Timing(every_n_batches=10), TrainingDataMonitoring( observables, prefix="train", every_n_batches=10), DataStreamMonitoring( [cost], # without dropout test_stream, prefix="test", on_resumption=True, after_epoch=False, # by default this is True every_n_batches=100), # all monitored data is ready so print it...
# (next steps may take more time and we want to see the # results as soon as possible so print as soon as you can) Printing(every_n_batches=10), # perform multiple dumps at different intervals # so if one of them breaks (has nan) we can hopefully # find a model from few batches ago in the other Checkpoint(jobname, before_training=False, after_epoch=True, save_separately=['log', 'model']), Sample(generator, steps=max_length, path=jobname+'.test', every_n_batches=100), ProgressBar(), FinishAfter(after_n_epochs=epochs) # This shows a way to handle NaN emerging during # training: simply finish it. .add_condition(["after_batch"], _is_nan), ] if bokeh: from blocks.extras.extensions.plot import Plot extensions.append(Plot( 'sketch', channels=[['cost']], every_n_batches=10)) # Construct the main loop and start training! main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
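# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the "finish on NaN" pattern used
# above, i.e. FinishAfter with an extra after_batch condition that inspects the
# training log. The record name 'train_cost' is hypothetical; only the standard
# Blocks extension API is assumed.
import numpy
from blocks.extensions import FinishAfter

def _cost_is_nan(log):
    value = log.current_row.get('train_cost')
    return value is not None and numpy.isnan(value)

finish_extension = FinishAfter(after_n_epochs=10).add_condition(
    ["after_batch"], _cost_is_nan)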
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph( batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter( applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter( applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append( rename(gain_matrix.min(), 'min_gain')) primary_observables.append( rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")( cost_cg)[-1] attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight penalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # shared variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm.
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if 
train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] extensions.append(Patience(**patience_conf)) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
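# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): what attach_aggregation_schemes
# relies on, i.e. aggregation.mean(numerator, denominator) attaches a scheme so
# that monitoring reports a dataset-level ratio (here: entropy per label rather
# than per batch). Variable names are hypothetical.
from theano import tensor
from blocks.monitoring import aggregation

labels_mask = tensor.matrix('labels_mask')
weights_entropy = tensor.scalar('weights_entropy')   # already summed over the batch
entropy_per_label = aggregation.mean(weights_entropy, labels_mask.sum())
entropy_per_label.name = 'weights_entropy_per_label'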
def evaluate(c, tar_path, *args, **kwargs): """ Performs rudimentary evaluation of SNLI/MNLI run * Runs on valid and test given network * Saves all predictions * Saves embedding matrix * Saves results.json and predictions.csv """ # Load and configure model = kwargs['model'] assert c.endswith("json") c = json.load(open(c)) # Very ugly absolute path fix ABS_PATHS = [ "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/", "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/" ] from six import string_types for abs_path in ABS_PATHS: for k in c: if isinstance(c[k], string_types): if c[k].startswith(abs_path): c[k] = c[k][len(abs_path):] # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) logging.info("Updating config with " + str(kwargs)) c.update(**kwargs) # NOTE: This assures we don't miss crucial definition for some def heavy words # usually it is a good idea c['max_def_per_word'] = c['max_def_per_word'] * 2 assert tar_path.endswith("tar") dest_path = os.path.dirname(tar_path) prefix = os.path.splitext(os.path.basename(tar_path))[0] s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') if model == 'simple': model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data( c) elif model == 'esim': model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data( c) else: raise NotImplementedError() pred = model.apply(s1_decoded, s1_mask, s2_decoded, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=False) cg = ComputationGraph([pred]) if c.get("bn", True): bn_params = [ p for p in VariableFilter(bricks=[BatchNormalization])(cg) if hasattr(p, "set_value") ] else: bn_params = [] # Load model model = Model(cg.outputs) parameters = model.get_parameter_dict() # Blocks version mismatch logging.info( "Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ])], width=120)) logging.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) ]))) with open(tar_path) as src: params = load_parameters(src) loaded_params_set = set(params.keys()) model_params_set = set([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) logging.info("Loaded extra parameters") logging.info(loaded_params_set - model_params_set) logging.info("Missing parameters") logging.info(model_params_set - loaded_params_set) model.set_parameter_values(params) if c.get("bn", True): logging.info("Loading " + str([ get_brick(param).get_hierarchical_name(param) for param in bn_params ])) for param in bn_params: param.set_value( params[get_brick(param).get_hierarchical_name(param)]) for p in bn_params: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p # Read logs logs = pd.read_csv(os.path.join(dest_path, "logs.csv")) best_val_acc = 
logs['valid_misclassificationrate_apply_error_rate'].min() logging.info("Best measured valid acc: " + str(best_val_acc)) # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores reference_vocab = Vocabulary( os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt')) vocab_all = Vocabulary( os.path.join( fuel.config.data_path[0], c['data_path'], 'vocab_all.txt')) # Can include OOV words, which is interesting retrieval_all = Retrieval(vocab_text=used_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt") # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']: # variables = VariableFilter(name=name)(cg) # if len(variables): # s1_emb = variables[0] # # A bit sloppy about downcast # # if "dict" in name: # embedder = construct_dict_embedder( # theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), # vocab=data.vocab, retrieval=retrieval_all) # else: # embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True), # vocab=data.vocab) # # for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]: # logging.info("Calculating {} embeddings for {}".format(name, v_name)) # Predict predict_fnc = theano.function(cg.inputs, pred) results = {} batch_size = 14 for subset in ['valid', 'test']: logging.info("Predicting on " + subset) stream = data.get_stream(subset, batch_size=batch_size, seed=778) it = stream.get_epoch_iterator() rows = [] for ex in tqdm.tqdm(it, total=10000 / batch_size): ex = dict(zip(stream.sources, ex)) inp = [ex[v.name] for v in cg.inputs] prob = predict_fnc(*inp) label_pred = np.argmax(prob, axis=1) for id in range(len(prob)): s1_decoded = used_vocab.decode(ex['sentence1'][id]).split() s2_decoded = used_vocab.decode(ex['sentence2'][id]).split() assert used_vocab == data.vocab s1_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s1_decoded ] s2_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s2_decoded ] # Different difficulty metrics # text_unk_percentage s1_no_pad = [w for w in ex['sentence1'][id] if w != 0] s2_no_pad = [w for w in ex['sentence2'][id] if w != 0] s1_unk_percentage = sum([ 1. for w in s1_no_pad if w == used_vocab.unk ]) / len(s1_no_pad) s2_unk_percentage = sum([ 1. 
for w in s2_no_pad if w == used_vocab.unk ]) / len(s2_no_pad) # mean freq word s1_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s1_no_pad ]) s2_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s2_no_pad ]) # mean rank word (UNK is max rank) # NOTE(kudkudak): Will break if we reindex unk between vocabs :P s1_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s1_no_pad ]) s2_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s2_no_pad ]) rows.append({ "pred": label_pred[id], "true_label": ex['label'][id], "s1": ' '.join(s1_decoded), "s2": ' '.join(s2_decoded), "s1_unk_percentage": s1_unk_percentage, "s2_unk_percentage": s2_unk_percentage, "s1_mean_freq": s1_mean_freq, "s2_mean_freq": s2_mean_freq, "s1_mean_rank": s1_mean_rank, "s2_mean_rank": s2_mean_rank, "p_0": prob[id, 0], "p_1": prob[id, 1], "p_2": prob[id, 2] }) preds = pd.DataFrame(rows, columns=rows[0].keys()) preds.to_csv( os.path.join(dest_path, prefix + '_predictions_{}.csv'.format(subset))) results[subset] = {} results[subset]['misclassification'] = 1 - np.mean( preds.pred == preds.true_label) if subset == "valid" and np.abs( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc) > 0.001: logging.error("!!!") logging.error( "Found different best_val_acc. Probably due to changed specification of the model class." ) logging.error("Discrepancy {}".format( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc)) logging.error("!!!") logging.info(results) json.dump(results, open(os.path.join(dest_path, prefix + '_results.json'), "w"))
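# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the predict-over-a-stream pattern
# used above, i.e. compile a theano function on cg.inputs and feed it batches
# keyed by variable name. The tiny MLP and in-memory dataset are hypothetical.
import numpy
import theano
from theano import tensor
from blocks.bricks import MLP, Tanh, Softmax
from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant
from fuel.datasets import IndexableDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

mlp = MLP([Tanh(), Softmax()], [4, 8, 3],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
features = tensor.matrix('features')
probs = mlp.apply(features)
mlp.initialize()
cg = ComputationGraph([probs])
predict_fnc = theano.function(cg.inputs, probs)

dataset = IndexableDataset(
    {'features': numpy.random.rand(10, 4).astype(theano.config.floatX)})
stream = DataStream(dataset,
                    iteration_scheme=SequentialScheme(dataset.num_examples, 5))
for batch in stream.get_epoch_iterator(as_dict=True):
    inputs = [batch[v.name] for v in cg.inputs]
    print(predict_fnc(*inputs).argmax(axis=1))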
def main(save_to): batch_size = 500 image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction convnet = create_lenet_5() mnist_test = MNIST(("test",), sources=['features', 'targets']) basis_init = create_fair_basis(mnist_test, 10, 2) # b = shared_floatx(basis) # random_init = numpy.rand.random(100, 1000) # r = shared_floatx(random_init) # rn = r / r.norm(axis=1) # x = tensor.dot(rn, tensor.shape_padright(b)) x = shared_floatx(basis_init) # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) learning_rate = shared_floatx(0.01, 'learning_rate') unit = shared_floatx(0, 'unit', dtype='int64') negate = False suffix = '_negsynth.jpg' if negate else '_synth.jpg' for output in outs: layer = get_brick(output) # For now, skip masks -for some reason they are always NaN iterations = 10000 layername = layer.parents[0].name + '-' + layer.name # if layername != 'noisylinear_2-linear': # continue dims = layer.get_dims(['output'])[0] if negate: measure = -output else: measure = output measure = measure[(slice(0, basis_init.shape[0]), ) + (slice(None),) * (measure.ndim - 1)] if isinstance(dims, numbers.Integral): dims = (dims, ) costvec = -tensor.log(tensor.nnet.softmax( measure)[:,unit].flatten()) else: flatout = measure.flatten(ndim=3) maxout = flatout.max(axis=2) costvec = -tensor.log(tensor.nnet.softmax( maxout)[:,unit].flatten()) # Add a regularization to favor gray images. # cost = costvec.sum() + (x - 0.5).norm(2) * ( # 10.0 / basis_init.shape[0]) cost = costvec.sum() grad = gradient.grad(cost, x) stepx = x - learning_rate * grad normx = stepx / tensor.shape_padright( stepx.flatten(ndim=2).max(axis=1), n_ones=3) newx = tensor.clip(normx, 0, 1) newx = newx[(slice(0, basis_init.shape[0]), ) + (slice(None),) * (newx.ndim - 1)] fn = theano.function([], [cost], updates=[(x, newx)]) filmstrip = Filmstrip( basis_init.shape[-2:], (dims[0], basis_init.shape[0]), background='red') for u in range(dims[0]): unit.set_value(u) x.set_value(basis_init) print('layer', layername, 'unit', u) for index in range(iterations): c = fn()[0] if index % 1000 == 0: print('cost', c) result = x.get_value() for i2 in range(basis_init.shape[0]): filmstrip.set_image((u, i2), result[i2,:,:,:]) filmstrip.save(layername + suffix) result = x.get_value() for index in range(basis_init.shape[0]): filmstrip.set_image((u, index), result[index,:,:,:]) filmstrip.save(layername + suffix)
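# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the activation-maximization loop
# used above, i.e. hold the input in a shared variable and let a theano update
# step it along the gradient of a chosen unit. The one-matrix "network" is a
# hypothetical stand-in for the convnet.
import numpy
import theano
from theano import tensor, gradient
from blocks.utils import shared_floatx

x = shared_floatx(numpy.random.rand(1, 4), name='x')
W = shared_floatx(numpy.random.rand(4, 3), name='W')
unit = 0
cost = -tensor.dot(x, W)[:, unit].sum()   # maximize the unit by minimizing -activation
step = x - 0.1 * gradient.grad(cost, x)
update_fn = theano.function([], cost, updates=[(x, tensor.clip(step, 0, 1))])
for _ in range(100):
    update_fn()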
def main(argv): name = argv[1] files = map(lambda p: join(folder, p), listdir(folder)) file = next(filter(lambda n: name in n, files)) print(file) p = load_parameter_values(file) net = net_dvc((128, 128)) x = tensor.tensor4('image_features') y_hat = net.apply(x) g = Model(y_hat) for k, v in p.items(): p[k] = v.astype('float32') g.set_parameter_values(p) a, t, v = get_dvc((128, 128), trainning=False, shortcut=False) run = function([x], y_hat) def run_test(data): res = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) return res def max_index(l): if l[0] > l[1]: return 0 else: return 1 def write_kaggle(f, l): f.write("id,label\n") for i, e in enumerate(l, start=1): f.write(str(i) + "," + str(e) + "\n") def kaggle(file, data): write_kaggle(file, map(max_index, run_test(data))) def accuracy(data): res = [] true = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) true.extend(i[1]) res = map(max_index, res) total = 0 equal = 0 for r, t in zip(res, true): total += 1 equal += 1 if r == t else 0 return equal / total print("Training accuracy: ", accuracy(a)) print("Test accuracy: ", accuracy(v)) kaggle_file = join(result_folder, name + ".kaggle") print(kaggle_file) with open(kaggle_file, 'w') as f: kaggle(f, t)
def evaluate_lm(config, tar_path, part, num_examples, dest_path, **kwargs): c = config if part not in ['valid', 'test_unseen', 'test']: raise ValueError() data, lm, _ = initialize_data_and_model(c) words = T.ltensor3('words') words_mask = T.matrix('words_mask') costs = lm.apply(words, words_mask) cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) perplexities = VariableFilter(name_regex='perplexity.*')(cg) mask_sums = [p.tag.aggregation_scheme.denominator for p in perplexities] CEs = [p.tag.aggregation_scheme.numerator for p in perplexities] proba_out, = VariableFilter(name='proba_out')(cg) unk_ratios = VariableFilter(name_regex='unk_ratio.*')(cg) #num_definitions, = VariableFilter(name='num_definitions')(cg) print perplexities print CEs print mask_sums name_to_aggregate = [p.name for p in perplexities] for CE, mask_sum, name in zip(CEs, mask_sums, name_to_aggregate): CE.name = name + "_num" mask_sum.name = name + "_denom" compute_l = CEs + mask_sums + unk_ratios if part == 'test_unseen': compute_l.append(proba_out) compute = dict({p.name: p for p in compute_l}) print "to compute:", compute.keys() predict_f = theano.function([words, words_mask], compute) if part == 'test_unseen': batch_size = 1 else: batch_size = 128 # size of test_unseen stream = data.get_stream(part, batch_size=batch_size, max_length=100) raw_data = [] # list of dicts containing the inputs and computed outputs i = 0 print "start computing" for input_data in stream.get_epoch_iterator(as_dict=True): if i and i % 100 == 0: print "iteration:", i words = input_data['words'] words_mask = input_data['words_mask'] to_save = predict_f(words, words_mask) to_save.update(input_data) raw_data.append(to_save) i += 1 # aggregate in the log space aggregated = Counter() sum_mask_track = Counter() for d in raw_data: coef = d['words_mask'].sum() # over timesteps and batches for name in name_to_aggregate: aggregated[name] += d[name + "_num"] sum_mask_track[name] += d[name + "_denom"] for k, v in aggregated.iteritems(): print "k, v, m:", k, v, sum_mask_track[k] aggregated[k] = np.exp(v / sum_mask_track[k]) n_params = sum([np.prod(p.shape.eval()) for p in cg.parameters]) aggregated['n_params'] = n_params print "aggregated stats:", aggregated print "# of parameters {}".format(n_params) #TODO: check that different batch_size yields same validation error than # end of training validation error. # TODO: I think blocks aggreg is simply mean which should break # when we use masks??? investigate if not os.path.exists(dest_path): os.makedirs(dest_path) if part == 'test_unseen': np.savez( os.path.join(dest_path, "predictions"), words=input_data['words'], words_mask=input_data['words_mask'], #unk_ratio = to_save['unk_ratio'], #def_unk_ratio = to_save['def_unk_ratio'], proba_out=to_save['languagemodel_apply_proba_out'], vocab_in=lm._vocab.words[:c['num_input_words']], vocab_out=lm._vocab.words[:c['num_output_words']]) json.dump(aggregated, open(os.path.join(dest_path, "aggregates.json"), "w"), sort_keys=True, indent=2)
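# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the log-space aggregation done
# above, i.e. sum cross-entropy numerators and mask denominators over batches
# and exponentiate only once at the end. The per-batch numbers are made up.
import numpy as np

batch_outputs = [
    {'perplexity_num': 120.0, 'perplexity_denom': 50.0},
    {'perplexity_num': 200.0, 'perplexity_denom': 90.0},
]
total_ce = sum(b['perplexity_num'] for b in batch_outputs)        # summed cross-entropy
total_words = sum(b['perplexity_denom'] for b in batch_outputs)   # summed mask
print(np.exp(total_ce / total_words))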
########### GET THE DATA ##################### stream_train = ServerDataStream(('image_features','targets'), False, port=5652, hwm=50) stream_valid = ServerDataStream(('image_features','targets'), False, port=5653, hwm=50) ########### DEFINE THE ALGORITHM ############# track_cost = TrackTheBest("cost", after_epoch=True, after_batch=False) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=0.0001, momentum=0.9)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), TrainingDataMonitoring([cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True), Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True), ProgressBar(), Printing()] #Adding a live plot with the bokeh server extensions.append(Plot( 'CatsVsDogs160_GoogleNet_Reload2_l0001', channels=[['train_error_rate', 'valid_error_rate'], ['valid_cost', 'valid_error_rate2'], ['train_total_gradient_norm']], after_batch=True)) params = load_parameter_values('GoogleParameters.pkl') model = Model(cost) model.set_parameter_values(params) main_loop = MainLoop(algorithm,data_stream=stream_train,model=model,extensions=extensions) main_loop.run()
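# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): the producer process that the
# ServerDataStream instances above expect, using fuel's built-in server. The
# dataset and the port/hwm values are hypothetical (they mirror the ones used
# above only for illustration).
from fuel.datasets import MNIST
from fuel.schemes import ShuffledScheme
from fuel.server import start_server
from fuel.streams import DataStream

train = MNIST(('train',))
stream = DataStream.default_stream(
    train, iteration_scheme=ShuffledScheme(train.num_examples, 128))
start_server(stream, port=5652, hwm=50)   # serves batches until killed; run in its own process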
def main(save_to, hist_file): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test",), sources=['features', 'targets']) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs) .copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) model = Model([error_rate, confusion]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict((full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( model.variables)) # Load histogram information with open(hist_file, 'rb') as handle: histograms = pickle.load(handle) # Corpora mnist_train = MNIST(("train",)) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme( mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme( mnist_test.num_examples, batch_size)) # Probe the given layer target_layer = '/lenet/mlp/linear_0' next_layer_param = '/lenet/mlp/linear_1.W' sample = extract_sample(outs[target_layer], mnist_test_stream) print('sample shape', sample.shape) # Figure neurons to ablate hist = histograms[('linear_1', 'b')] targets = [i for i in range(hist.shape[1]) if hist[2, i] * hist[7, i] < 0] print('ablating', len(targets), ':', targets) # Now adjust the next layer weights based on the probe param = model.get_parameter_dict()[next_layer_param] print('param shape', param.get_value().shape) new_weights = ablate_inputs( targets, sample, param.get_value(), compensate=False) param.set_value(new_weights) # Evaluation pass evaluator = DatasetEvaluator([error_rate, confusion]) 
print(evaluator.evaluate(mnist_test_stream))
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() layers = convnet.layers logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32') mnist_test = MNIST(("test",), sources=['features', 'targets']) basis = create_fair_basis(mnist_test, 10, 10) # state = mnist_test.open() # # basis = numpy.zeros((100, 1, 28, 28), dtype=theano.config.floatX) # counters = [0] * 10 # index = 0 # while min(counters) < 10: # feature, target = mnist_test.get_data(state=state, request=[index]) # target = target[0, 0] # feature = feature / 256 # if counters[target] < 10: # basis[target + counters[target] * 10, :, :, :] = feature[0, :, :, :] # counters[target] += 1 # index += 1 # mnist_test.close(state=state) # b = shared_floatx(basis) # random_init = numpy.rand.random(100, 1000) # r = shared_floatx(random_init) # rn = r / r.norm(axis=1) # x = tensor.dot(rn, tensor.shape_padright(b)) x = tensor.tensor4('features') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) fn = theano.function([x], outs) results = fn(basis) for snapshots, output in zip(results, outs): layer = get_brick(output) filmstrip = Filmstrip( basis.shape[-2:], (snapshots.shape[1], snapshots.shape[0]), background='purple') if layer in layers: fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1]) for unit in range(snapshots.shape[1]): for index in range(snapshots.shape[0]): mask = make_mask(basis.shape[-2:], fieldmap, numpy.clip( snapshots[index, unit, :, :], 0, numpy.inf)) filmstrip.set_image((unit, index), basis[index, :, :, :], mask) filmstrip.save(layer.name + '_show.jpg')
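# ---------------------------------------------------------------------------
# Minimal sketch (not part of this codebase): compiling one function that
# returns every intermediate Linear output at once, the same VariableFilter
# trick used above to snapshot layer activations. The two-layer MLP is a
# hypothetical stand-in for LeNet.
import numpy
import theano
from theano import tensor
from blocks.bricks import Linear, MLP, Rectifier
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant
from blocks.roles import OUTPUT

mlp = MLP([Rectifier(), Rectifier()], [16, 8, 4],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
features = tensor.matrix('features')
cg = ComputationGraph([mlp.apply(features)])
mlp.initialize()
outs = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg.variables)
snapshot_fn = theano.function([features], outs)
batch = numpy.random.rand(5, 16).astype(theano.config.floatX)
for out, value in zip(outs, snapshot_fn(batch)):
    print(out, value.shape)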
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32') layers = [l for l in convnet.layers if isinstance(l, Convolutional)] mnist_test = MNIST(("test", ), sources=['features', 'targets']) basis_init = create_fair_basis(mnist_test, 10, 50) basis_set = make_shifted_basis(basis_init, convnet, layers) for layer, basis in zip(layers, basis_set): # basis is 5d: # (probed_units, base_cases, 1-c, 28-y, 28-x) b = shared_floatx(basis) # coefficients is 2d: # (probed_units, base_cases) coefficients = shared_floatx( numpy.ones(basis.shape[0:2], dtype=theano.config.floatX)) # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x) prod = tensor.shape_padright(coefficients, 3) * b # x is 4d: (probed_units, 1-c, 28-y, 28-x) ux = prod.sum(axis=1) x = tensor.clip( ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3), 0, 1) # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter(roles=[OUTPUT], bricks=[layer])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) learning_rate = shared_floatx(0.03, 'learning_rate') # We will try to do all units at once. # unit = shared_floatx(0, 'unit', dtype='int64') # But we are only doing one layer at once. 
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims, )
            unitrange = tensor.arange(dims[0])
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            costvec = -tensor.log(
                tensor.nnet.softmax(output[unitrange, unitrange,
                                           dims[1] // 2,
                                           dims[2] // 2]).flatten())
        cost = costvec.sum()
        # grad is dims (probed_units, basis_size)
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])
        filmstrip = Filmstrip(random_init.shape[-2:], (dims[0], 1),
                              background='red')
        layer = get_brick(output)
        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                filmstrip.save(layer.name + '_stroke.jpg')
        for u in range(dims[0]):
            filmstrip.set_image((u, 0), result[u, :, :, :])
        filmstrip.save(layer.name + '_stroke.jpg')
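# In plain numpy terms, the symbolic x constructed above is: for each probed
# unit, a coefficient-weighted sum of that unit's basis images, clipped and
# rescaled so its peak value is 1; after each step the coefficients are
# renormalised to mean 1 per unit (the `newc` update).  A hypothetical sketch
# of that arithmetic with made-up shapes:
import numpy


def compose_images(coefficients, basis):
    """coefficients: (units, cases); basis: (units, cases, 1, 28, 28)."""
    prod = coefficients[:, :, None, None, None] * basis
    ux = prod.sum(axis=1)                                 # (units, 1, 28, 28)
    peak = ux.reshape(ux.shape[0], -1).max(axis=1)
    return numpy.clip(ux / peak[:, None, None, None], 0, 1)


units, cases = 3, 5
basis = numpy.random.rand(units, cases, 1, 28, 28)
coefficients = numpy.ones((units, cases))
x = compose_images(coefficients, basis)
coefficients /= coefficients.mean(axis=1, keepdims=True)  # the `newc` step
print(x.shape, float(x.max()))                            # (3, 1, 28, 28) 1.0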
def __init__(self, save_to): batch_size = 500 image_size = (28, 28) output_size = 10 convnet = create_lenet_5() layers = convnet.layers logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test",), sources=['features', 'targets']) basis = create_fair_basis(mnist_test, 10, 10) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict((full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( cg.variables)) # Normalize input and apply the convnet error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs) .copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) confusion_image = (ConfusionImage().apply(y.flatten(), probs, x) .copy(name='confusion_image')) confusion_image.tag.aggregation_scheme = Sum(confusion_image) model = Model( [error_rate, confusion, confusion_image] + list(outs.values())) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) self.model = model self.mnist_test_stream = mnist_test_stream self.evaluator = DatasetEvaluator( [error_rate, confusion, confusion_image]) self.base_results = self.evaluator.evaluate(mnist_test_stream) # TODO: allow target layer to be parameterized self.target_layer = '/lenet/mlp/linear_0' self.next_layer_param = '/lenet/mlp/linear_1.W' self.base_sample = extract_sample( outs[self.target_layer], mnist_test_stream) self.base_param_value = ( model.get_parameter_dict()[ self.next_layer_param].get_value().copy())
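# The ConfusionMatrix variable above is aggregated with Sum over the test
# batches, so the final table is just per-(true, predicted) counts.  A small
# numpy equivalent for a single batch of hard predictions (hypothetical
# helper, not the Blocks brick):
import numpy


def confusion_matrix(targets, predictions, num_classes=10):
    matrix = numpy.zeros((num_classes, num_classes), dtype=int)
    for t, p in zip(targets, predictions):
        matrix[t, p] += 1
    return matrix


targets = numpy.array([3, 5, 3, 8])
predictions = numpy.array([3, 3, 3, 8])
print(confusion_matrix(targets, predictions))  # rows = true class, columns = prediction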
def main(argv):
    name = argv[1]
    files = map(lambda p: join(folder, p), listdir(folder))
    file = next(filter(lambda n: name in n, files))
    print(file)
    p = load_parameter_values(file)
    net = net_dvc((128, 128))
    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)
    g = Model(y_hat)
    for k, v in p.items():
        p[k] = v.astype('float32')
    g.set_parameter_values(p)
    a, t, v = get_dvc((128, 128), trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i, e in enumerate(l, start=1):
            f.write(str(i) + "," + str(e) + "\n")

    def kaggle(file, data):
        write_kaggle(file, map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)
        total = 0
        equal = 0
        for r, t in zip(res, true):
            total += 1
            equal += 1 if r == t else 0
        return equal / total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))
    kaggle_file = join(result_folder, name + ".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)
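# max_index above only handles the two-class cats-vs-dogs case; with numpy the
# same accuracy computation generalises to any number of classes.  A
# hypothetical sketch (assuming `probs` has shape (examples, classes)):
import numpy


def accuracy_from_probs(probs, targets):
    predictions = probs.argmax(axis=1)
    return float((predictions == numpy.asarray(targets).ravel()).mean())


probs = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
print(accuracy_from_probs(probs, [0, 1, 1]))  # -> 0.666...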
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('features') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) algorithm = MaximumActivationSearch(outputs=outs) # Use the mnist test set, unshuffled mnist_test = MNIST(("test",), sources=['features']) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) extensions = [Timing(), FinishAfter(after_n_epochs=1), DataStreamMonitoring( [], mnist_test_stream, prefix="test"), Checkpoint("maxact.tar"), ProgressBar(), Printing()] main_loop = MainLoop( algorithm, mnist_test_stream, model=model, extensions=extensions) main_loop.run() examples = mnist_test.get_example_stream() example = examples.get_data(0)[0] layers = convnet.layers for output, record in algorithm.maximum_activations.items(): layer = get_brick(output) activations, indices, snapshots = ( r.get_value() if r else None for r in record[1:]) filmstrip = Filmstrip( example.shape[-2:], (indices.shape[1], indices.shape[0]), background='blue') if layer in layers: fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1]) for unit in range(indices.shape[1]): for index in range(100): mask = make_mask(example.shape[-2:], fieldmap, numpy.clip( snapshots[index, unit, :, :], 0, numpy.inf)) imagenum = indices[index, unit, 0] filmstrip.set_image((unit, index), examples.get_data(imagenum)[0], mask) else: for unit in range(indices.shape[1]): for index in range(100): imagenum = indices[index, unit] filmstrip.set_image((unit, index), examples.get_data(imagenum)[0]) filmstrip.save(layer.name + '_maxact.jpg')
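# MaximumActivationSearch above keeps, for every probed unit, the test
# examples that activate it most strongly.  A hypothetical numpy sketch of
# that bookkeeping, retaining the top-k example indices per unit as batches
# stream by (the real extension also stores activation snapshots):
import numpy


def update_top_k(best_vals, best_idx, batch_acts, batch_offset, k):
    """batch_acts: (batch, units) peak activation of each unit per example."""
    batch_idx = numpy.arange(batch_acts.shape[0]) + batch_offset
    vals = numpy.concatenate([best_vals, batch_acts], axis=0)
    idx = numpy.concatenate(
        [best_idx, numpy.tile(batch_idx[:, None], (1, batch_acts.shape[1]))])
    order = numpy.argsort(-vals, axis=0)[:k]
    cols = numpy.arange(vals.shape[1])
    return vals[order, cols], idx[order, cols]


units, k = 4, 3
best_vals = numpy.full((k, units), -numpy.inf)
best_idx = numpy.zeros((k, units), dtype=int)
for start in range(0, 20, 5):
    acts = numpy.random.rand(5, units)
    best_vals, best_idx = update_top_k(best_vals, best_idx, acts, start, k)
print(best_idx)  # indices of the 3 strongest examples for each unit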
convnet.layers[1].weights_init = Uniform(width=0.2) convnet.layers[2].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2) convnet.initialize() ''' ######################################################### #Generate output and error signal predict = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') cg = ComputationGraph(cost) #Load the parameters of the model params = load_parameter_values('catsVsDogs256.pkl') mo = Model(predict) mo.set_parameter_values(params) print mo.inputs print dir(mo.inputs) print mo.outputs f = theano.function(mo.inputs, mo.outputs, allow_input_downcast=True) predictions = [] k = 0 for batch in stream_test.get_epoch_iterator(): example = numpy.array([batch[0]]) batch_predictions = f(example) #batch_predictions=batch_predictions[0] #get the array for result in batch_predictions: res = numpy.argmax(result) predictions.append(res) k += 1
def run_visualizations(cost, updates, train_stream, valid_stream, args, hidden_states=None, gate_values=None): # Load the parameters from a dumped model assert args.load_path is not None param_values = load_parameter_values(args.load_path) if args.hide_all_except is not None: i = args.hide_all_except sdim = args.state_dim output_size = get_output_size(args.dataset) hidden = np.zeros((args.layers * sdim, output_size), dtype=np.float32) output_w = param_values["/output_layer.W"] hidden[i * sdim:(i + 1) * sdim, :] = output_w[i * sdim:(i + 1) * sdim, :] param_values["/output_layer.W"] = hidden model = Model(cost) model.set_parameter_values(param_values) # Run a visualization if args.visualize == "generate": visualize_generate(cost, hidden_states, updates, train_stream, valid_stream, args) elif args.visualize == "gates" and (gate_values is not None): if args.rnn_type == "lstm": visualize_gates_lstm(gate_values, hidden_states, updates, train_stream, valid_stream, args) elif args.rnn_type == "soft": visualize_gates_soft(gate_values, hidden_states, updates, train_stream, valid_stream, args) else: assert False elif args.visualize == "states": visualize_states(hidden_states, updates, train_stream, valid_stream, args) elif args.visualize == "gradients": visualize_gradients(hidden_states, updates, train_stream, valid_stream, args) elif args.visualize == "jacobian": visualize_jacobian(hidden_states, updates, train_stream, valid_stream, args) elif args.visualize == "presoft": visualize_presoft(cost, hidden_states, updates, train_stream, valid_stream, args) elif args.visualize == "matrices": visualize_matrices(args) elif args.visualize == "trained_singular_values": visualize_singular_values(args) elif args.visualize == "gradients_flow_pie": visualize_gradients_flow_pie(hidden_states, updates, args) else: assert False
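# The hide_all_except branch above keeps only the block of output-layer rows
# that is fed by hidden layer i and zeroes the rest.  The same manipulation in
# plain numpy (hypothetical shapes: `layers` stacked state blocks of `sdim`
# rows each):
import numpy


def keep_only_layer(output_w, i, sdim):
    hidden = numpy.zeros_like(output_w)
    hidden[i * sdim:(i + 1) * sdim, :] = output_w[i * sdim:(i + 1) * sdim, :]
    return hidden


output_w = numpy.random.rand(3 * 5, 10)      # 3 layers, sdim = 5, 10 outputs
masked = keep_only_layer(output_w, i=1, sdim=5)
print(numpy.nonzero(masked.any(axis=1))[0])  # -> rows 5..9 survive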
class VGGClassifier(object): def __init__(self, model_path='vgg.tar', synset_words='synset_words.txt'): self.vgg_net = VGGNet() x = theano.tensor.tensor4('x') y_hat = self.vgg_net.apply(x) cg = ComputationGraph(y_hat) self.model = Model(y_hat) with open(model_path, 'rb') as f: self.model.set_parameter_values(load_parameters(f)) with open(synset_words) as f: self.classes = numpy.array(f.read().splitlines()) self.predict = cg.get_theano_function() fc15 = VariableFilter( theano_name_regex='fc_15_apply_output')(cg.variables)[0] self.fe_extractor = ComputationGraph(fc15).get_theano_function() def classify(self, image, top=1): """Classify an image with the 1000 concepts of the ImageNet dataset. :image: numpy image or image path. :top: Number of top classes for this image. :returns: list of strings with synsets predicted by the VGG model. """ if type(image) == str: image = VGGClassifier.resize_and_crop_image(image) idx = self.predict(image)[0].flatten().argsort() top = idx[::-1][:top] return self.classes[top] def get_features(self, image): """Returns the activations of the last hidden layer for a given image. :image: numpy image or image path. :returns: numpy vector with 4096 activations. """ if type(image) == str: image = VGGClassifier.resize_and_crop_image(image) return self.fe_extractor(image)[0] def resize_and_crop_image(input_file, output_box=[224, 224], fit=True): # https://github.com/BVLC/caffe/blob/master/tools/extra/resize_and_crop_images.py '''Downsample the image. ''' img = Image.open(input_file) box = output_box # preresize image with factor 2, 4, 8 and fast algorithm factor = 1 while img.size[0] / factor > 2 * box[0] and img.size[1] * 2 / factor > 2 * box[1]: factor *= 2 if factor > 1: img.thumbnail( (img.size[0] / factor, img.size[1] / factor), Image.NEAREST) # calculate the cropping box and get the cropped part if fit: x1 = y1 = 0 x2, y2 = img.size wRatio = 1.0 * x2 / box[0] hRatio = 1.0 * y2 / box[1] if hRatio > wRatio: y1 = int(y2 / 2 - box[1] * wRatio / 2) y2 = int(y2 / 2 + box[1] * wRatio / 2) else: x1 = int(x2 / 2 - box[0] * hRatio / 2) x2 = int(x2 / 2 + box[0] * hRatio / 2) img = img.crop((x1, y1, x2, y2)) # Resize the image with best quality algorithm ANTI-ALIAS img = img.resize(box, Image.ANTIALIAS).convert('RGB') img = numpy.asarray(img, dtype='float32')[..., [2, 1, 0]] img[:, :, 0] -= 103.939 img[:, :, 1] -= 116.779 img[:, :, 2] -= 123.68 img = img.transpose((2, 0, 1)) img = numpy.expand_dims(img, axis=0) return img
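# A hypothetical usage sketch of the classifier above; the file names are
# placeholders and must point at an existing parameter dump and synset list.
if __name__ == '__main__':
    classifier = VGGClassifier(model_path='vgg.tar',
                               synset_words='synset_words.txt')
    # Top-5 ImageNet synsets for an image on disk
    print(classifier.classify('example.jpg', top=5))
    # 4096-dimensional activation vector of the last hidden layer
    print(classifier.get_features('example.jpg').shape)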
def train_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, model = initialize_data_and_model(config, train_phase=True) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") keys = tensor.lmatrix('keys') n_identical_keys = tensor.lvector('n_identical_keys') words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': #TODO test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] if use_keys(c) and use_n_identical_keys(c): costs = model.apply(words, words_mask, keys, n_identical_keys, train_phase=True) elif use_keys(c): costs = model.apply(words, words_mask, keys, train_phase=True) else: costs = model.apply(words, words_mask, train_phase=True) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) monitored_vars = [length, cost, perplexity] if c['proximity_coef']: proximity_term, = VariableFilter(name='proximity_term')(cg) monitored_vars.append(proximity_term) print "inputs of the model:", cg.inputs parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: if c['freeze_pretrained']: logger.debug( "Exclude pretrained encoder embeddings from the trained parameters" ) to_freeze = 'main' elif c['provide_targets']: logger.debug( "Exclude pretrained targets from the trained parameters") to_freeze = 'target' trained_parameters = [ p for p in trained_parameters if not p == model.get_def_embeddings_params(to_freeze) ] saved_parameters = [ p for p in saved_parameters if not p == model.get_def_embeddings_params(to_freeze) ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. 
stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream( 'train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) print "trainin_stream will contains sources:", training_stream.sources original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validate = c['mon_freq_valid'] > 0 if validate: valid_stream = data.get_stream( 'valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) validation = DataStreamMonitoring( monitored_vars, valid_stream, prefix="valid").set_conditions(before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(cost), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: cp_path = state_path load = (LoadNoUnpickling(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } else: cp_path = main_loop_path load = (Load(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(cp_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: intermediate_cp = IntermediateCheckpoint( cp_path, every_n_epochs=c['checkpoint_every_n_epochs'], every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) if validate: checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ]) if validate: extensions.extend([validation, track_the_best]) extensions.append(checkpoint) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: extensions.append(intermediate_cp) extensions.extend( [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])]) if validate and c['n_valid_early'] > 0: extensions.append( FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=c['n_valid_early'] * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid'])) extensions.append(FinishAfter(after_n_epochs=c['n_epochs'])) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
for p in cg.parameters:
    print(str(p), p.shape, p.dtype)

print("Created ComputationGraph, inputs:")
print(cg.inputs)

# Strangely, all the examples use DataStreamMonitoring in MainLoop
model = Model(labels)
print("Model.dict_of_inputs():")
print(model.dict_of_inputs())
print("Model list inputs:")
print([v.name for v in model.inputs])

# Model loading from saved file
model.set_parameter_values(load_parameter_values(save_state_path))

examine_embedding(lookup.W.get_value())

label_ner = model.get_theano_function()
print(model.inputs)
print("printed label_ner.params")

for test_data in data_stream.get_epoch_iterator():
    # Explicitly strip off the pre-defined labels
    ordered_batch = test_data[0:3]
    # print(ordered_batch)
    results = label_ner(*ordered_batch)
    # print(results)  # This is a pure array of labels
def train_extractive_qa(new_training_job, config, save_path, params, fast_start, fuel_server, seed): if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed root_path = os.path.join(save_path, 'training_state') extension = '.tar' tar_path = root_path + extension best_tar_path = root_path + '_best' + extension c = config data, qam = initialize_data_and_model(c) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', shuffle=True, batch_size=4, max_length=5).get_epoch_iterator(as_dict=True)) for var in qam.input_vars.values(): var.tag.test_value = test_value_data[var.name] costs = qam.apply_with_default_vars() cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(qam.contexts.shape[1], 'length') batch_size = rename(qam.contexts.shape[0], 'batch_size') predicted_begins, = VariableFilter(name='predicted_begins')(cg) predicted_ends, = VariableFilter(name='predicted_ends')(cg) exact_match, = VariableFilter(name='exact_match')(cg) exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio') context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg) monitored_vars = [ length, batch_size, cost, exact_match_ratio, context_unk_ratio ] if c['dict_path']: def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg) num_definitions = rename(qam.input_vars['defs'].shape[0], 'num_definitions') max_definition_length = rename(qam.input_vars['defs'].shape[1], 'max_definition_length') monitored_vars.extend( [def_unk_ratio, num_definitions, max_definition_length]) if c['def_word_gating'] == 'self_attention': def_gates = VariableFilter(name='def_gates')(cg) def_gates_min = tensor.minimum(*[x.min() for x in def_gates]) def_gates_max = tensor.maximum(*[x.max() for x in def_gates]) monitored_vars.extend([ rename(def_gates_min, 'def_gates_min'), rename(def_gates_max, 'def_gates_max') ]) text_match_ratio = TextMatchRatio(data_path=os.path.join( fuel.config.data_path[0], 'squad/dev-v1.1.json'), requires=[ predicted_begins, predicted_ends, tensor.ltensor3('contexts_text'), tensor.lmatrix('q_ids') ], name='text_match_ratio') parameters = cg.get_parameter_dict() trained_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == qam.embeddings_var() ] if c['train_only_def_part']: def_reading_parameters = qam.def_reading_parameters() trained_parameters = [ p for p in trained_parameters if p in def_reading_parameters ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) # apply dropout to the training cost and to all the variables # that we monitor during training train_cost = cost train_monitored_vars = list(monitored_vars) if c['dropout']: regularized_cg = ComputationGraph([cost] + train_monitored_vars) # Dima: the dropout that I implemented first bidir_outputs, = VariableFilter(bricks=[Bidirectional], roles=[OUTPUT])(cg) readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg) dropout_vars = [bidir_outputs] + readout_layers logger.debug("applying dropout to {}".format(", ".join( [v.name for v in dropout_vars]))) regularized_cg = apply_dropout(regularized_cg, dropout_vars, 
c['dropout']) # a new dropout with exactly same mask at different steps emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg) emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout']) if c['emb_dropout_type'] == 'same_mask': regularized_cg = apply_dropout2(regularized_cg, emb_vars, c['emb_dropout'], dropout_mask=emb_dropout_mask) elif c['emb_dropout_type'] == 'regular': regularized_cg = apply_dropout(regularized_cg, emb_vars, c['emb_dropout']) else: raise ValueError("unknown dropout type {}".format( c['emb_dropout_type'])) train_cost = regularized_cg.outputs[0] train_monitored_vars = regularized_cg.outputs[1:] rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=train_cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) training_stream = data.get_stream('train', batch_size=c['batch_size'], shuffle=True, max_length=c['max_length']) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) extensions = [ LoadNoUnpickling(tar_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job), StartFuelServer(original_training_stream, os.path.join(save_path, 'stream.pkl'), before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ] validation = DataStreamMonitoring( [text_match_ratio] + monitored_vars, data.get_stream('dev', batch_size=c['batch_size_valid'], raw_text=True, q_ids=True), prefix="dev").set_conditions(before_training=not fast_start, after_epoch=True) dump_predictions = DumpPredictions(save_path, text_match_ratio, before_training=not fast_start, after_epoch=True) track_the_best_exact = TrackTheBest( validation.record_name(exact_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) track_the_best_text = TrackTheBest( validation.record_name(text_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) extensions.extend([ validation, dump_predictions, track_the_best_exact, track_the_best_text ]) # We often use pretrained word embeddings and we don't want # to load and save them every time. To avoid that, we use # save_main_loop=False, we only save the trained parameters, # and we save the log and the iterations state separately # in the tar file. 
extensions.extend([ Checkpoint(tar_path, parameters=trained_parameters, save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], every_n_batches=c['save_freq_batches'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best_text.notification_name), (best_tar_path, )), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_train'], after_training=True), RetrievalPrintStats(retrieval=data._retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start), Printing(after_epoch=True, every_n_batches=c['mon_freq_train']), FinishAfter(after_n_batches=c['n_batches'], after_n_epochs=c['n_epochs']), Annealing(c['annealing_learning_rate'], after_n_epochs=c['annealing_start_epoch']), LoadNoUnpickling(best_tar_path, after_n_epochs=c['annealing_start_epoch']) ]) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def train_language_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, lm, retrieval = initialize_data_and_model(config) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] costs, updates = lm.apply(words, words_mask) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) perplexities = VariableFilter(name_regex='perplexity.*')(cg) monitored_vars = [length, cost] + perplexities if c['dict_path']: num_definitions, = VariableFilter(name='num_definitions')(cg) monitored_vars.extend([num_definitions]) parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == lm.get_def_embeddings_params() ] saved_parameters = [ p for p in saved_parameters if not p == lm.get_def_embeddings_params() ] if c['cache_size'] != 0: logger.debug("Enable fake recursivity for looking up embeddings") trained_parameters = [ p for p in trained_parameters if not p == lm.get_cache_params() ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['cache_size'] != 0: algorithm.add_updates(updates) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg) main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg) train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS]) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. 
stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream('train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed) valid_stream = data.get_stream('valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validation = DataStreamMonitoring(monitored_vars, valid_stream, prefix="valid").set_conditions( before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(perplexity), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: load = (LoadNoUnpickling(state_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(state_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( state_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) else: load = (Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(main_loop_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( main_loop_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] if retrieval: extensions.append( RetrievalPrintStats(retrieval=retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start)) extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), validation, track_the_best, checkpoint ]) if c['checkpoint_every_n_batches']: extensions.append(intermediate_cp) extensions.extend([ DumpTensorflowSummaries(save_path, every_n_batches=c['mon_freq_train'], after_training=True), Printing(on_resumption=True, every_n_batches=c['mon_freq_train']), FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=50 * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid']), FinishAfter(after_n_batches=c['n_batches']) ]) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def evaluate_extractive_qa(config, tar_path, part, num_examples, dest_path, qids=None, dataset=None): if not dest_path: dest_path = os.path.join(os.path.dirname(tar_path), 'predictions.json') log_path = os.path.splitext(dest_path)[0] + '_log.json' if qids: qids = qids.split(',') if dataset: dataset = SQuADDataset(dataset, ('all', )) c = config data, qam = initialize_data_and_model(c) costs = qam.apply_with_default_vars() cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) predicted_begins, = VariableFilter(name='predicted_begins')(cg) predicted_ends, = VariableFilter(name='predicted_ends')(cg) compute = {'begins': predicted_begins, 'ends': predicted_ends} if c['coattention']: d2q_att_weights, = VariableFilter(name='d2q_att_weights')(cg) q2d_att_weights, = VariableFilter(name='q2d_att_weights')(cg) compute.update({'d2q': d2q_att_weights, 'q2d': q2d_att_weights}) compute['costs'] = costs predict_func = theano.function(qam.input_vars.values(), compute) logger.debug("Ready to evaluate") done_examples = 0 num_correct = 0 def print_stats(): print('EXACT MATCH RATIO: {}'.format(num_correct / float(done_examples))) predictions = {} log = {} stream = data.get_stream(part, batch_size=1, shuffle=part == 'train', raw_text=True, q_ids=True, dataset=dataset) for example in stream.get_epoch_iterator(as_dict=True): if done_examples == num_examples: break q_id = vec2str(example['q_ids'][0]) if qids and not q_id in qids: continue example['contexts_text'] = [map(vec2str, example['contexts_text'][0])] example['questions_text'] = [ map(vec2str, example['questions_text'][0]) ] feed = dict(example) del feed['q_ids'] del feed['contexts_text'] del feed['questions_text'] del feed['contexts_text_mask'] result = predict_func(**feed) correct_answer_span = slice(example['answer_begins'][0], example['answer_ends'][0]) predicted_answer_span = slice(result['begins'][0], result['ends'][0]) correct_answer = example['contexts_text'][0][correct_answer_span] answer = example['contexts_text'][0][predicted_answer_span] is_correct = correct_answer_span == predicted_answer_span context = example['contexts_text'][0] question = example['questions_text'][0] context_def_map = example['contexts_def_map'] # pretty print outcome = 'correct' if is_correct else 'wrong' print('#{}'.format(done_examples)) print(u"CONTEXT:", detokenize(context)) print(u"QUESTION:", detokenize(question)) print(u"RIGHT ANSWER: {}".format(detokenize(correct_answer))) print( u"ANSWER (span=[{}, {}], {}):".format(predicted_answer_span.start, predicted_answer_span.stop, outcome), detokenize(answer)) print(u"COST: {}".format(float(result['costs'][0]))) print(u"DEFINITIONS AVAILABLE FOR:") for pos in set(context_def_map[:, 1]): print(context[pos]) print() # update statistics done_examples += 1 num_correct += is_correct # save the results predictions[q_id] = detokenize(answer) log_entry = { 'context': context, 'question': question, 'answer': answer, 'correct_answer': correct_answer, 'cost': float(result['costs'][0]) } if c['coattention']: log_entry['d2q'] = cPickle.dumps(result['d2q'][0]) log_entry['q2d'] = cPickle.dumps(result['q2d'][0]) log[q_id] = log_entry if done_examples % 100 == 0: print_stats() print_stats() with open(log_path, 'w') as dst: json.dump(log, dst, indent=2, sort_keys=True) with open(dest_path, 'w') as dst: json.dump(predictions, dst, indent=2, sort_keys=True)
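# The exact-match statistic printed above reduces to comparing the predicted
# token span with the annotated one.  A hypothetical standalone version of
# that bookkeeping:
def exact_match_ratio(pairs):
    """pairs: list of (gold_span, predicted_span) slice pairs."""
    return sum(gold == predicted for gold, predicted in pairs) / float(len(pairs))


print(exact_match_ratio([(slice(3, 5), slice(3, 5)),
                         (slice(0, 2), slice(1, 2))]))  # -> 0.5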
    TrainingDataMonitoring(
        [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train", after_epoch=True),
    DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid,
                         prefix="valid", after_epoch=True),
    Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True),
    ProgressBar(),
    Printing()
]

# Adding a live plot with the bokeh server
extensions.append(
    Plot('CatsVsDogs160_GoogleNet_Reload2_l0001',
         channels=[['train_error_rate', 'valid_error_rate'],
                   ['valid_cost', 'valid_error_rate2'],
                   ['train_total_gradient_norm']],
         after_batch=True))

params = load_parameter_values('GoogleParameters.pkl')
model = Model(cost)
model.set_parameter_values(params)

main_loop = MainLoop(algorithm,
                     data_stream=stream_train,
                     model=model,
                     extensions=extensions)
main_loop.run()
def main(mode, save_path, steps, num_batches, load_params): chars = (list(string.ascii_uppercase) + list(range(10)) + [' ', '.', ',', '\'', '"', '!', '?', '<UNK>']) char_to_ind = {char: i for i, char in enumerate(chars)} ind_to_char = {v: k for k, v in char_to_ind.iteritems()} train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'], char_to_ind, bos_token=None, eos_token=None, level='character') valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'], char_to_ind, bos_token=None, eos_token=None, level='character') vocab_size = len(char_to_ind) logger.info('Dictionary size: {}'.format(vocab_size)) if mode == 'continue': continue_training(save_path) return elif mode == "sample": main_loop = load(open(save_path, "rb")) generator = main_loop.model.get_top_bricks()[-1] sample = ComputationGraph(generator.generate( n_steps=steps, batch_size=1, iterate=True)).get_theano_function() states, outputs, costs = [data[:, 0] for data in sample()] print("".join([ind_to_char[s] for s in outputs])) numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] return # Experiment configuration batch_size = 20 dim = 650 feedback_dim = 650 valid_stream = valid_dataset.get_example_stream() valid_stream = Batch(valid_stream, iteration_scheme=ConstantScheme(batch_size)) valid_stream = Padding(valid_stream) valid_stream = Mapping(valid_stream, _transpose) # Build the bricks and initialize them transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh()) generator = SequenceGenerator( Readout(readout_dim=vocab_size, source_names=transition.apply.states, emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback( vocab_size, feedback_dim, name='feedback'), name="readout"), transition, weights_init=Uniform(std=0.04), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() transition.push_initialization_config() generator.initialize() # Build the cost computation graph. features = tensor.lmatrix('features') features_mask = tensor.matrix('features_mask') cost_matrix = generator.cost_matrix( features, mask=features_mask) batch_cost = cost_matrix.sum() cost = aggregation.mean( batch_cost, features.shape[1]) cost.name = "sequence_log_likelihood" char_cost = aggregation.mean( batch_cost, features_mask.sum()) char_cost.name = 'character_log_likelihood' ppl = 2 ** (cost / numpy.log(2)) ppl.name = 'ppl' bits_per_char = char_cost / tensor.log(2) bits_per_char.name = 'bits_per_char' length = features.shape[0] length.name = 'length' model = Model(batch_cost) if load_params: params = load_parameter_values(save_path) model.set_parameter_values(params) if mode == "train": # Give an idea of what's going on. 
logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_parameters().items()], width=120)) train_stream = train_dataset.get_example_stream() train_stream = Mapping(train_stream, _truncate) train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(batch_size)) train_stream = Padding(train_stream) train_stream = Mapping(train_stream, _transpose) parameters = model.get_parameter_dict() maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values()) algorithm = GradientDescent( cost=batch_cost, parameters=parameters.values(), step_rule=CompositeRule([StepClipping(1000.), AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects) ])) ft = features[:6, 0] ft.name = 'feature_example' observables = [cost, ppl, char_cost, length, bits_per_char] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) track_the_best_bpc = TrackTheBest('valid_bits_per_char') root_path, extension = os.path.splitext(save_path) this_step_monitoring = TrainingDataMonitoring( observables + [ft], prefix="this_step", after_batch=True) average_monitoring = TrainingDataMonitoring( observables + [algorithm.total_step_norm, algorithm.total_gradient_norm], prefix="average", every_n_batches=10) valid_monitoring = DataStreamMonitoring( observables, prefix="valid", every_n_batches=1500, before_training=False, data_stream=valid_stream) main_loop = MainLoop( algorithm=algorithm, data_stream=train_stream, model=model, extensions=[ this_step_monitoring, average_monitoring, valid_monitoring, track_the_best_bpc, Checkpoint(save_path, ), Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_bpc.notification_name), (root_path + "_best" + extension,)), Timing(after_batch=True), Printing(every_n_batches=10), Plot(root_path, [[average_monitoring.record_name(cost), valid_monitoring.record_name(cost)], [average_monitoring.record_name(algorithm.total_step_norm)], [average_monitoring.record_name(algorithm.total_gradient_norm)], [average_monitoring.record_name(ppl), valid_monitoring.record_name(ppl)], [average_monitoring.record_name(char_cost), valid_monitoring.record_name(char_cost)], [average_monitoring.record_name(bits_per_char), valid_monitoring.record_name(bits_per_char)]], every_n_batches=10) ]) main_loop.run() elif mode == 'evaluate': with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f: raw_words = [line.split()[1:-1] for line in f.readlines()] words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] for w in raw_words] max_word_length = max([len(w) for w in words]) initial_states = tensor.matrix('init_states') cost_matrix_step = generator.cost_matrix(features, mask=features_mask, states=initial_states) cg = ComputationGraph(cost_matrix_step) states = cg.auxiliary_variables[-2] compute_cost = theano.function([features, features_mask, initial_states], [cost_matrix_step.sum(axis=0), states]) cost_matrix = generator.cost_matrix(features, mask=features_mask) initial_cg = ComputationGraph(cost_matrix) initial_states = 
initial_cg.auxiliary_variables[-2] total_word_cost = 0 num_words = 0 examples = numpy.zeros((max_word_length + 1, len(words)), dtype='int64') all_masks = numpy.zeros((max_word_length + 1, len(words)), dtype=floatX) for i, word in enumerate(words): examples[:len(word), i] = word all_masks[:len(word), i] = 1. single_space = numpy.array([char_to_ind[' ']])[:, None] for batch in valid_stream.get_epoch_iterator(): for example, mask in equizip(batch[0].T, batch[1].T): example = example[:(mask.sum())] spc_inds = list(numpy.where(example == char_to_ind[" "])[0]) state = generator.transition.transition.initial_states_.get_value()[None, :] for i, j in equizip([-1] + spc_inds, spc_inds + [-1]): word = example[(i+1):j, None] word_cost, states = compute_cost( word, numpy.ones_like(word, dtype=floatX), state) state = states[-1] costs = numpy.exp(-compute_cost( examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0]) _, space_states = compute_cost( single_space, numpy.ones_like(single_space, dtype=floatX), state) state = space_states[-1] word_prob = numpy.exp(-word_cost) total_word_cost += word_cost + numpy.log(numpy.sum(costs)) num_words += 1 print(word_prob) print(numpy.sum(costs)) print("Average cost", total_word_cost / num_words) print("PPL", numpy.exp(total_word_cost / num_words)) print("Word-level perplexity") print(total_word_cost / num_words) else: assert False
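# The ppl and bits_per_char variables defined above are plain transforms of a
# negative log-likelihood expressed in nats: 2 ** (nll / ln 2) is just
# exp(nll), and dividing a per-character nll by ln 2 converts it to bits.  A
# quick numpy check of the arithmetic:
import numpy

nll = 1.2                                            # some NLL in nats
assert numpy.isclose(2 ** (nll / numpy.log(2)), numpy.exp(nll))
print(numpy.exp(nll), nll / numpy.log(2))            # perplexity, bits per char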
def main(mode, save_path, num_batches, data_path=None): reverser = WordReverser(100, len(char2code), name="reverser") if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) # Initialization settings reverser.weights_init = IsotropicGaussian(0.1) reverser.biases_init = Constant(0.0) reverser.push_initialization_config() reverser.encoder.weights_init = Orthogonal() reverser.generator.transition.weights_init = Orthogonal() # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = reverser.cost( chars, chars_mask, targets, targets_mask).sum() batch_size = chars.shape[1].copy(name="batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in parameters.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Define the training algorithm. cg = ComputationGraph(cost) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)])) # Fetch variables useful for debugging generator = reverser.generator (energies,) = VariableFilter( applications=[generator.readout.readout], name_regex="output")(cg.variables) (activations,) = VariableFilter( applications=[generator.transition.apply], name=generator.transition.apply.states[0])(cg.variables) max_length = chars.shape[0].copy(name="max_length") cost_per_character = aggregation.mean( batch_cost, batch_size * max_length).copy( name="character_log_likelihood") min_energy = energies.min().copy(name="min_energy") max_energy = energies.max().copy(name="max_energy") mean_activation = abs(activations).mean().copy( name="mean_activation") observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm] for name, parameter in parameters.items(): observables.append(parameter.norm(2).copy(name + "_norm")) observables.append(algorithm.gradients[parameter].norm(2).copy( name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring( observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches) # This shows a way to handle NaN emerging during # training: simply finish it. .add_condition(["after_batch"], _is_nan), # Saving the model and the log separately is convenient, # because loading the whole pickle takes quite some time. 
Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1)]) main_loop.run() elif mode == "sample" or mode == "beam_search": chars = tensor.lmatrix("input") generated = reverser.generate(chars) model = Model(generated) logger.info("Loading the model..") model.set_parameter_values(load_parameter_values(save_path)) def generate(input_): """Generate output sequences for an input sequence. Incapsulates most of the difference between sampling and beam search. Returns ------- outputs : list of lists Trimmed output sequences. costs : list The negative log-likelihood of generating the respective sequences. """ if mode == "beam_search": samples, = VariableFilter( applications=[reverser.generator.generate], name="outputs")( ComputationGraph(generated[1])) # NOTE: this will recompile beam search functions # every time user presses Enter. Do not create # a new `BeamSearch` object every time if # speed is important for you. beam_search = BeamSearch(samples) outputs, costs = beam_search.search( {chars: input_}, char2code['</S>'], 3 * input_.shape[0]) else: _1, outputs, _2, _3, costs = ( model.get_theano_function()(input_)) outputs = list(outputs.T) costs = list(costs.T) for i in range(len(outputs)): outputs[i] = list(outputs[i]) try: true_length = outputs[i].index(char2code['</S>']) + 1 except ValueError: true_length = len(outputs[i]) outputs[i] = outputs[i][:true_length] costs[i] = costs[i][:true_length].sum() return outputs, costs while True: try: line = input("Enter a sentence\n") message = ("Enter the number of samples\n" if mode == "sample" else "Enter the beam size\n") batch_size = int(input(message)) except EOFError: break except Exception: traceback.print_exc() continue encoded_input = [char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip()] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input,))[0] print("Target: ", target) samples, costs = generate( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for sample, cost in equizip(samples, costs): message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
dump_path = os.path.join("model_params", model_name + ".pkl")

# Build model
m = config.Model(config, ds.vocab_size)

# Build the Blocks stuff for training
model = Model(m.probs)  # sgd_cost
# model = Model(m.sgd_cost)
# print [v for l in m.monitor_vars for v in l]
print "new model:---------------------------------------------------------"
# print model.get_parameter_values()

try:
    with open(dump_path, 'r') as f:
        logger.info('Loading parameters from %s...' % dump_path)
        model.set_parameter_values(cPickle.load(f))
except IOError:
    pass

# print model.get_parameter_dict()
print "loaded model:---------------------------------------------------------"

# tensor.nnet.softmax(m.sgd_cost)
print model.outputs
# print model.error_rate
print "output ^ -------------\n"
# print m.monitor_vars_valid
# print model.error_rate
print "mon vars valid ^ -------------\n"

m.probs.name = "probs"
m.cost.name = "cost"
testpred = PredictDataStream(test_stream, [m.probs, m.cost], "results.txt")