def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):
    """Fit a linear softmax classifier on every fold and average the weights.

    One linear layer (no bias) followed by a softmax is trained separately
    on each ``(train_idx, valid_idx)`` pair in `idx_folds`, always starting
    from the same initial parameter values.  For every fold the weights
    recorded by ``BestParams`` (keyed on the best validation error rate) are
    collected; their mean over folds is finally written into the linear
    layer.

    The per-fold weights are cached in ``'<model_prefix>_weights.npy'``.  If
    that file already exists, training is skipped and the cached weights are
    loaded instead.

    Parameters
    ----------
    X : array
        Fed to a Theano matrix variable, i.e. one flattened feature vector
        per row; ``X.shape[-1]`` is the default classifier input dimension.
    Y : sequence of int
        Class labels.  They are used to index ``np.eye(n_classes)`` with
        ``n_classes = len(set(Y))``, so they must be integers in
        ``0 .. n_classes - 1``.
    idx_folds : iterable
        Pairs ``(train_idx, valid_idx)`` used to index `X` and `Y`.
    hyper_params : dict
        Keys read here: ``classifier_pool_width``, ``classifier_l1wdecay``,
        ``classifier_max_epochs``, ``classifier_batch_size``; additionally
        ``classifier_input_shape`` when ``self.pipeline_factory`` is set and
        ``classifier_pool_input_shape`` / ``classifier_pool_stride`` when
        pooling is enabled.
    model_prefix : str
        Prefix of the weight cache file; must not be ``None``.
    verbose : bool
        If true, ``Printing`` and ``ProgressBar`` extensions are added to
        each fold's main loop.

    Returns
    -------
    model, predict
        The Blocks ``Model`` wrapping the class probabilities, and a compiled
        Theano function mapping features to the arg-max class index.
    """
    import os
    from collections import OrderedDict
    from fuel.datasets import IndexableDataset

    from blocks.model import Model
    from blocks.bricks import Linear, Softmax
    from blocks.bricks.conv import MaxPooling
    from blocks.initialization import Uniform

    from deepthought.bricks.cost import HingeLoss
    import numpy as np
    import theano
    from theano import tensor

    assert model_prefix is not None

    fold_weights_filename = '{}_weights.npy'.format(model_prefix)

    # convert Y to one-hot encoding
    n_classes = len(set(Y))
    Y = np.eye(n_classes, dtype=int)[Y]

    features = tensor.matrix('features', dtype=theano.config.floatX)
    targets = tensor.lmatrix('targets')

    input_ = features

    # classifier input dimension; overwritten below if extra layers are used
    dim = X.shape[-1]

    # optional additional layers
    if self.pipeline_factory is not None:
        # need to re-shape flattened input to restore bc01 format
        input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape']  # tuple, uses actual batch size
        input_ = input_.reshape(input_shape)

        pipeline = self.pipeline_factory.build_pipeline(input_shape, hyper_params)
        input_ = pipeline.apply(input_)
        input_ = input_.flatten(ndim=2)

        # this is very hacky, but there seems to be no elegant way to obtain a value for dim
        # (the pipeline is run once on a single example and the output width is read off)
        dummy_fn = theano.function(inputs=[features], outputs=input_)
        dummy_out = dummy_fn(X[:1])
        dim = dummy_out.shape[-1]

    if hyper_params['classifier_pool_width'] > 1:
        # FIXME: this is probably broken!
        # c = hyper_params['num_components']
        # input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1))  # restore bc01

        # need to re-shape flattened input to restore bc01 format
        input_shape = hyper_params['classifier_pool_input_shape']  # tuple
        input_ = input_.reshape(input_shape)

        pool = MaxPooling(name='pool',
                          input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1),
                          pooling_size=(hyper_params['classifier_pool_width'], 1),
                          step=(hyper_params['classifier_pool_stride'], 1))
        input_ = pool.apply(input_)
        # flatten again: one row per example
        input_ = input_.reshape((input_.shape[0], tensor.prod(input_.shape[1:])))

        dim = np.prod(pool.get_dim('output'))

    linear = Linear(name='linear',
                    input_dim=dim,
                    output_dim=n_classes,
                    weights_init=Uniform(mean=0, std=0.01),
                    use_bias=False)
    linear.initialize()

    softmax = Softmax('softmax')

    probs = softmax.apply(linear.apply(input_))
    prediction = tensor.argmax(probs, axis=1)

    model = Model(probs)  # classifier with raw probability outputs
    predict = theano.function([features], prediction)  # ready-to-use predict function

    if os.path.isfile(fold_weights_filename):
        # load filter weights from existing file
        fold_weights = np.load(fold_weights_filename)
        print 'loaded filter weights from', fold_weights_filename
    else:
        # train model

        from blocks.bricks.cost import MisclassificationRate
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.bricks import Softmax
        from blocks.model import Model
        from blocks.algorithms import GradientDescent, Adam
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
        from blocks.extensions.predicates import OnLogRecord
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams
        # from deepthought.datasets.selection import DatasetMetaDB

        # snapshot of the freshly initialized parameters; restored before
        # every fold so that all folds start from the same point
        init_param_values = model.get_parameter_values()

        cost = HingeLoss().apply(targets, probs)
        # Note: this requires just the class labels, not in a one-hot encoding
        error_rate = MisclassificationRate().apply(targets.argmax(axis=1), probs)
        error_rate.name = 'error_rate'

        # built before the L1 term is added; only used to find weights/parameters
        cg = ComputationGraph([cost])

        # L1 regularization
        if hyper_params['classifier_l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['classifier_l1wdecay'] * sum([abs(W).sum() for W in weights])

        cost.name = 'cost'

        # iterate over trial folds
        fold_weights = []
        fold_errors = []

        # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
        #
        #     train_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['train'])
        #     valid_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['valid'])
        #
        #     metadb = DatasetMetaDB(meta, train_selectors.keys())
        #
        #     # get selected trial IDs
        #     train_idx = metadb.select(train_selectors)
        #     valid_idx = metadb.select(valid_selectors)

        for train_idx, valid_idx in idx_folds:

            # print train_idx
            # print valid_idx

            trainset = IndexableDataset(indexables=OrderedDict(
                [('features', X[train_idx]), ('targets', Y[train_idx])]))

            validset = IndexableDataset(indexables=OrderedDict(
                [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

            # reset to the common starting point for this fold
            model.set_parameter_values(init_param_values)

            # records the parameters whenever TrackTheBest flags a new best
            # validation error rate in the log
            best_params = BestParams()
            best_params.add_condition(['after_epoch'],
                                      predicate=OnLogRecord('error_rate_valid_best_so_far'))

            algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam())

            extensions = [Timing(),
                          FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']),
                          DataStreamMonitoring(
                              [cost, error_rate],
                              DataStream.default_stream(
                                  validset,
                                  iteration_scheme=SequentialScheme(
                                      validset.num_examples, hyper_params['classifier_batch_size'])),
                              suffix="valid"),
                          TrainingDataMonitoring(
                              [cost, error_rate,
                               aggregation.mean(algorithm.total_gradient_norm)],
                              suffix="train",
                              after_epoch=True),
                          TrackTheBest('error_rate_valid'),
                          best_params  # after TrackTheBest!
                          ]

            if verbose:
                extensions.append(Printing())  # optional
                extensions.append(ProgressBar())

            main_loop = MainLoop(
                algorithm,
                DataStream.default_stream(
                    trainset,
                    iteration_scheme=ShuffledScheme(trainset.num_examples,
                                                    hyper_params['classifier_batch_size'])),
                model=model,
                extensions=extensions)

            main_loop.run()

            # keep this fold's best weight matrix and its validation error
            fold_weights.append(best_params.values['/linear.W'])
            fold_errors.append(main_loop.status['best_error_rate_valid'])

            # break  # FIXME

        # fold errors are only reported, not returned
        fold_errors = np.asarray(fold_errors).squeeze()
        print 'simple NN fold classification errors:', fold_errors

        fold_weights = np.asarray(fold_weights)

        # store filter weights for later analysis
        np.save(fold_weights_filename, fold_weights)

    # final classifier weights: mean over folds (loaded or freshly trained)
    weights = fold_weights.mean(axis=0)

    linear.parameters[0].set_value(weights)

    return model, predict
def main(mode, save_path, steps, num_batches):
    """Train a sequence generator on Markov chain data, or sample from one.

    Parameters
    ----------
    mode : str
        ``"train"`` builds a gated-recurrent sequence generator, trains it
        on ``MarkovChainDataset`` and periodically serializes the main loop
        to `save_path`.  ``"sample"`` unpickles a main loop from `save_path`,
        generates `steps` symbols and prints the generation cost, the symbol
        frequencies and the empirical transition frequencies next to the
        dataset's reference values.
    save_path : str
        Path of the serialized main loop (written in ``"train"`` mode, read
        in ``"sample"`` mode).
    steps : int
        Number of generation steps; only used in ``"sample"`` mode.
    num_batches : int
        Number of batches after which training stops; only used in
        ``"train"`` mode.

    Raises
    ------
    AssertionError
        If `mode` is neither ``"train"`` nor ``"sample"``.
    """
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        # Override the pushed default for the recurrent transition only.
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[
                FinishAfter(after_n_batches=num_batches),
                TrainingDataMonitoring(
                    [cost], prefix="this_step", after_every_batch=True),
                TrainingDataMonitoring([cost], prefix="average",
                                       every_n_batches=100),
                SerializeMainLoop(save_path, every_n_batches=500),
                Printing(every_n_batches=100)
            ])
        main_loop.run()
    elif mode == "sample":
        # NOTE: unpickling executes arbitrary code; only load checkpoints
        # from a trusted source.
        # The context manager closes the checkpoint file even if unpickling
        # fails (the handle used to be left open).
        with open(save_path, "rb") as checkpoint:
            main_loop = cPickle.load(checkpoint)
        # NOTE(review): training stores ``Model(cost)`` as the main loop's
        # model, while ``.generate`` is called on it here - confirm that the
        # loaded object really exposes ``generate``.
        generator = main_loop.model

        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()

        # batch_size is 1: keep only the single sequence of every output
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        # Count transitions between consecutive generated symbols and
        # normalize every row to obtain empirical transition probabilities.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        # Raised explicitly (same exception type as the former
        # ``assert False``) so that an unknown mode is still rejected when
        # Python runs with -O, which strips assert statements.
        raise AssertionError("unknown mode: {!r}".format(mode))
def visualize_generate(cost, hidden_states, updates,
                       train_stream, valid_stream,
                       args):
    """Generate from the trained network and visualize the result.

    The variable named ``"presoft"`` is extracted from the graph of `cost`
    and compiled into a function.  Up to ten batches are then taken from
    `train_stream`; only the first sequence of every batch is used.

    * For index-based datasets (``has_indices(args.dataset)``), the first
      ``args.initial_text_length`` steps seed a generation loop of
      ``args.generated_text_lenght`` steps.  The seed and the complete text
      are logged and, if ``args.save_path`` is set, the per-step probability
      distributions are plotted to ``<save_path>/prob_plot.png``.
    * Otherwise the first output feature is plotted against the target with
      matplotlib.

    Parameters
    ----------
    cost : Theano variable
        Cost whose computation graph contains the ``"presoft"`` variable.
    hidden_states
        Unused by this function.
    updates
        Passed to ``carry_hidden_state`` to carry the hidden state between
        calls of the compiled function.
    train_stream
        Data stream providing the batches.
    valid_stream
        Unused by this function.
    args
        Parsed options; reads ``dataset``, ``initial_text_length``,
        ``generated_text_lenght``, ``softmax_sampling`` and ``save_path``.
        ``hide_all_except`` is not used by this function.
    """
    from itertools import islice

    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]
    cg = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           reset=not (use_indices))

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs, outputs=presoft,
                               givens=givens, updates=f_updates)

    # Values that do not change between batches or generation steps.
    argmax = (args.softmax_sampling == 'argmax')
    ploting_path = None
    if args.save_path is not None:
        ploting_path = os.path.join(args.save_path, 'prob_plot.png')

    epoch_iterator = train_stream.get_epoch_iterator()
    # islice stops quietly when the stream holds fewer than ten batches,
    # where repeated next() calls would raise StopIteration.
    for all_ in islice(epoch_iterator, 10):
        # keep only the first sequence of the batch (batch axis is axis 1)
        all_sequence = all_[0][:, 0:1]
        targets = all_[1][:, 0:1]

        # In the case of characters and text
        if use_indices:
            init_ = all_sequence[:args.initial_text_length]

            # Time X Features
            probability_array = np.zeros((0, output_size))
            generated_text = init_

            for _ in range(args.generated_text_lenght):
                presoft_value = compiled(generated_text)
                # Get the last value of presoft
                last_presoft = presoft_value[-1:, 0, :]

                # Compute the probability distribution
                probabilities = softmax(last_presoft)
                # Store it in the list
                probability_array = np.vstack([probability_array,
                                               probabilities])

                # Sample a character out of the probability distribution
                last_output_sample = sample(probabilities,
                                            argmax)[:, None, :]

                # Concatenate the new value to the text
                generated_text = np.vstack(
                    [generated_text, last_output_sample])

            # Convert with real characters
            whole_sentence = conv_into_char(
                generated_text[:, 0], args.dataset)
            initial_sentence = whole_sentence[:init_.shape[0]]
            selected_sentence = whole_sentence[init_.shape[0]:]

            logger.info(''.join(initial_sentence) + '...')
            logger.info(''.join(whole_sentence))

            if ploting_path is not None:
                probability_plot(probability_array, selected_sentence,
                                 args.dataset, ploting_path)

        # In the case of sine wave dataset for example
        else:
            presoft_value = compiled(all_sequence)

            # the last time step is left out of both curves
            time_plot = presoft_value.shape[0] - 1

            plt.plot(np.arange(time_plot),
                     targets[:time_plot, 0, 0], label="target")
            plt.plot(np.arange(time_plot),
                     presoft_value[:time_plot, 0, 0], label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
############## # Test with first batch ############## x_tr, x_mask_tr = next(data_stream.get_epoch_iterator()) f1 = function([x, x_mask], cost) #print f1(x_tr, x_mask_tr) #ipdb.set_trace() ################ # Optimization Algorithm ################ cg = ComputationGraph(cost) model = Model(cost) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]), on_unused_sources='warn') train_monitor = TrainingDataMonitoring( variables=[cost], after_epoch = True, prefix="train") extensions = extensions=[ train_monitor, TrackTheBest('train_sequence_log_likelihood'),
def test_variable_filter():
    """Check every selection criterion supported by ``VariableFilter``.

    Covers filtering by role (with and without ``each_role``), by brick
    class and instance, by name and Theano name (plain and regex), and by
    application, first on a linear -> logistic -> bias graph and then on a
    ``Merge`` brick.
    """
    # Creating computation graph
    linear_brick = Linear(input_dim=2, output_dim=2, name='linear1')
    bias_brick = Bias(2, name='bias1')
    sigmoid = Logistic(name='sigm')

    x = tensor.vector()
    hidden = sigmoid.apply(linear_brick.apply(x))
    hidden.name = "h2act"
    y = bias_brick.apply(hidden)
    graph = ComputationGraph(y)

    all_parameters = [linear_brick.W, linear_brick.b,
                      bias_brick.parameters[0]]
    all_biases = [linear_brick.b, bias_brick.parameters[0]]
    linear_bias = [linear_brick.b]

    # Testing filtering by role
    assert all_parameters == VariableFilter(roles=[PARAMETER])(graph.variables)
    assert [] == VariableFilter(roles=[FILTER])(graph.variables)

    # Testing filtering by role using each_role flag
    any_role = VariableFilter(roles=[PARAMETER, BIAS])
    assert all_parameters == any_role(graph.variables)
    every_role = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not all_parameters == every_role(graph.variables)
    assert all_biases == every_role(graph.variables)

    # Testing filtering by bricks: a class first, then instances
    for brick_selection in ([Linear], [linear_brick], [linear_brick]):
        by_brick = VariableFilter(roles=[BIAS], bricks=brick_selection)
        assert linear_bias == by_brick(graph.variables)

    # Testing filtering by name and by name regex
    for criterion in ({'name': 'W_norm'}, {'name_regex': 'W_no.?m'}):
        by_name = VariableFilter(**criterion)
        assert [graph.variables[2]] == by_name(graph.variables)

    # Testing filtering by theano name and by theano name regex
    for criterion in ({'theano_name': 'h2act'},
                      {'theano_name_regex': 'h2a.?t'}):
        by_theano_name = VariableFilter(**criterion)
        assert [graph.variables[11]] == by_theano_name(graph.variables)

    # Testing filtering by application (checked twice, as before)
    by_application = VariableFilter(applications=[linear_brick.apply])
    expected = [graph.variables[1], graph.variables[8]]
    assert expected == by_application(graph.variables)

    by_application_again = VariableFilter(applications=[linear_brick.apply])
    assert expected == by_application_again(graph.variables)

    # Outputs of a brick with child bricks
    first_input = tensor.matrix('input1')
    second_input = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(first_input, second_input)
    merge_graph = ComputationGraph(merged)

    brick_outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_graph.variables)
    assert merged in brick_outputs
    assert len(brick_outputs) == 3

    application_outputs = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_graph.variables)
    assert application_outputs == [merged]
def train(cli_params):
    """Train a ladder network and return its training log as a dataframe.

    Parameters
    ----------
    cli_params : dict
        Command-line parameters.  ``cli_params['save_to']`` is handed to
        ``prepare_dir``; the resulting directory is stored back under
        ``'save_dir'`` and receives ``log.txt``.  The remaining entries are
        interpreted by ``load_and_log_params``.

    Returns
    -------
    The main loop's log converted with ``to_dataframe()``, or ``None`` if
    the log status reports that an epoch interrupt was received.
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    def denois_costs():
        # Fresh list on every call.  ``dict.values()`` is a view on
        # Python 3 and cannot be concatenated to a list; ``list(...)`` gives
        # the same result on Python 2.
        return list(ladder.costs.denois.values())

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': denois_costs(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', denois_costs()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', denois_costs()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + denois_costs(),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten,
                                cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + denois_costs(),
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total, ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm]
                + denois_costs(),
                prefix="train", after_epoch=True),

            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(p.save_dir, after_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate_clean'
    # last logged value of the final validation error
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
# NOTE(review): `this_sample`, `mgc_reconstruct`, `f0_tr` and `num_sample`
# are defined before this excerpt; the next three statements are presumably
# the tail of an earlier per-sample loop - confirm against the full file.
x_synth = mgcf02wav(mgc_reconstruct, f0_tr[this_sample])

# Peak-normalize to 95% of 2**15 before the cast to int16 below.
x_synth = .95 * x_synth / max(abs(x_synth)) * 2**15
# Written with a rate argument of 16000 (presumably scipy.io.wavfile - confirm).
wavfile.write(
    save_dir + "samples/new/data" + num_sample + str(this_sample) + ".wav",
    16000, x_synth.astype('int16'))

# Reload the best saved main loop and take the two top bricks of its model.
main_loop = load(save_dir + "pkl/best_" + experiment_name + ".pkl")
lookup, generator = main_loop.model.get_top_bricks()

from theano import tensor, function

phonemes = tensor.imatrix('phonemes')

# The number of steps and the batch size are both read from the shape of
# the phoneme matrix (axis 0 and axis 1 respectively).
sample = ComputationGraph(
    generator.generate(attended=lookup.apply(phonemes),
                       n_steps=phonemes.shape[0],
                       batch_size=phonemes.shape[1],
                       iterate=True))
sample_fn = sample.get_theano_function()

# Only the fourth output of the compiled function is kept.
outputs_bp = sample_fn(phonemes_tr)[3]

for this_sample in range(n_samples):
    print "Iteration: ", this_sample
    # every iteration reuses the same, already sampled outputs
    outputs = outputs_bp

    # last two entries of the feature axis (f0 and voicing, judging by the
    # variable names - confirm)
    sampled_f0 = outputs[:, :, -2]
    sampled_voiced = outputs[:, :, -1]

    print sampled_voiced.mean()
    print sampled_f0.max(), sampled_f0.min()
def train_model(cost, cross_entropy, updates,
                train_stream, valid_stream, args, gate_values=None):
    """Assemble the training main loop and run it.

    Parameters
    ----------
    cost : Theano variable
        Training cost; replaced by its weight-noise version when
        ``args.weight_noise`` is positive.
    cross_entropy : Theano variable
        Monitored on the validation stream together with the cost.
    updates
        State updates added to the algorithm, given to the monitoring,
        generation and visualization extensions, and whose first elements
        are reset every 100 batches.
    train_stream, valid_stream
        Data streams for training and validation monitoring.
    args
        Parsed options (noise level, paths, monitoring frequency, mode
        flags, ...).
    gate_values
        Optional gate values; the gate visualization is only added when
        this is not ``None`` and ``args.visualize_gates`` is set.

    Raises
    ------
    Exception
        If not in interactive mode and ``args.save_path`` already exists.
    """
    step_rule = learning_algorithm(args)
    graph = ComputationGraph(cost)

    # Regularization: weight noise
    noise_level = args.weight_noise
    if noise_level > 0:
        noisy_graph = apply_noise(
            graph, VariableFilter(roles=[WEIGHT])(graph.variables),
            noise_level)
        cost = noisy_graph.outputs[0]
    cost.name = "cost_with_weight_noise"
    graph = ComputationGraph(cost)

    logger.info(graph.parameters)

    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                params=graph.parameters)
    algorithm.add_updates(updates)

    # Extensions, in the order in which they are handed to the main loop
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    presoft_outputs = [candidate for candidate in graph.variables
                       if candidate.name == "presoft"]

    if args.generate:
        text_generation = TextGenerationExtension(
            outputs=presoft_outputs,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=args.monitoring_freq,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode)
        extensions.append(text_generation)

    train_monitoring = TrainingDataMonitoring(
        [cost], prefix='train',
        every_n_batches=args.monitoring_freq,
        after_epoch=True)
    valid_monitoring = DataStreamMonitoring(
        [cost, cross_entropy],
        valid_stream, args.mini_batch_size_valid,
        state_updates=updates,
        prefix='valid',
        before_first_epoch=not (args.visualize_gates),
        every_n_batches=args.monitoring_freq)
    state_reset = ResetStates([state for state, _ in updates],
                              every_n_batches=100)
    extensions += [train_monitoring, valid_monitoring, state_reset,
                   ProgressBar()]

    # Creating directory for saving model; refuse to reuse an existing one.
    if not args.interactive_mode:
        if os.path.exists(args.save_path):
            raise Exception('Directory already exists')
        os.makedirs(args.save_path)

    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience, args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            gate_visualizer = VisualizeGateLSTM(
                gate_values, updates, args.dataset, ploting_path=None)
            extensions.append(gate_visualizer)
        elif args.rnn_type == "soft":
            gate_visualizer = VisualizeGateSoft(
                gate_values, updates, args.dataset, ploting_path=None)
            extensions.append(gate_visualizer)
        else:
            assert False

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
    ).run()
def run_experiment():
    """Compare per-example gradient square-norms computed in two ways.

    A single convolutional layer is built, its filters and the inputs are
    set to ones, and one Theano function returns the cost, the per-example
    values from ``get_sum_square_norm_gradients_conv_transformations`` and
    the square-norm of the bias gradients.  The function is evaluated once
    on the whole mini-batch and once per example, and both sets of results
    are printed next to each other.
    """
    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    conv0 = ConvolutionalLayer(
        filter_size=(2, 2),
        num_filters=10,
        activation=Rectifier().apply,
        border_mode='valid',
        pooling_size=(1, 1),
        weights_init=Uniform(width=0.1),
        biases_init=Constant(0.0),
        name='conv0')
    conv_sequence = ConvolutionalSequence(
        [conv0], num_channels=nbr_channels, image_size=image_shape)
    conv_sequence.initialize()

    conv_output = conv_sequence.apply(X)
    y_hat = Flattener().apply(conv_output)

    # Whatever. Not important since we're not going to actually train anything.
    cost = tensor.sqr(y_hat).sum()

    y_hat_variables = ComputationGraph([y_hat]).variables

    # Reference value: works on the sum of the gradients in a mini-batch
    bias_gradients = [
        tensor.grad(cost, bias)
        for bias in VariableFilter(roles=[BIAS])(y_hat_variables)
    ]
    summed_square_norm = sum(
        tensor.sqr(gradient).sum() for gradient in bias_gradients)

    transformations_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_square_norms = \
        get_sum_square_norm_gradients_conv_transformations(
            transformations_by_layer, cost)

    # why does this thing depend on N again ?
    # I don't think I've used a cost that divides by N.

    N = 2
    # randn is still called so that the global RNG advances as before;
    # the values are then overwritten with ones.
    Xtrain = np.random.randn(
        N, nbr_channels, image_shape[0], image_shape[1]).astype(np.float32)
    Xtrain.fill(1.0)

    # Set every convolution filter weight to one as well.
    conv_filter = VariableFilter(roles=[FILTER])(y_hat_variables)[0]
    filter_value = conv_filter.get_value()
    filter_value[:, :, :, :] = 1.0
    conv_filter.set_value(filter_value)

    f = theano.function(
        [X], [cost, individual_square_norms, summed_square_norm])

    # Whole mini-batch in one call
    c, v0, gs2 = f(Xtrain)

    # One call per example, each as a mini-batch of size one
    per_example = [f(Xtrain[n:n + 1]) for n in range(N)]
    L_c = [result[0] for result in per_example]
    L_gs2 = [result[2] for result in per_example]

    print("Cost for whole mini-batch in single shot : %f." % c)
    print("Cost for whole mini-batch accumulated    : %f." % sum(L_c))
    print("")
    print("Square-norm of all gradients for each data point in single shot :")
    print(v0.reshape((1, -1)))
    print("Square-norm of all gradients for each data point iteratively :")
    print(np.array(L_gs2).reshape((1, -1)))
    print("")
    print("Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2))))
    print("")
    print("Ratios : ")
    print(np.array(L_gs2).reshape((1, -1)) / v0.reshape((1, -1)))
def train(step_rule, input_dim, state_dim, label_dim, layers, epochs, seed, pretrain_alignment, uniform_alignment, dropout, beam_search, test_cost, experiment_path, window_features, features, pool_size, maximum_frames, initialization, weight_noise, to_watch, patience, plot, write_predictions, static_mask, drop_prob, drop_prob_states, drop_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, seq_len, input_drop, augment, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def numpy_rng(random_seed=None): if random_seed == None: random_seed = 1223 return numpy.random.RandomState(random_seed) #from utilities import onehot, unhot, vec2chars # from http://www.iro.umontreal.ca/~memisevr/code/logreg.py #def onehot(x,numclasses=None): #""" Convert integer encoding for class-labels (starting with 0 !) #to one-hot encoding. #The output is an array who's shape is the shape of the input array plus #an extra dimension, containing the 'one-hot'-encoded labels. 
#""" #if x.shape==(): #x = x[None] #if numclasses is None: #numclasses = x.max() + 1 #result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") #z = numpy.zeros(x.shape, dtype="int") #for c in range(numclasses): #z *= 0 #z[numpy.where(x==c)] = 1 #result[...,c] += z #return result.astype(theano.config.floatX) #framelen = 1 #50 = 50 ##data = np.load(os.path.join(os.environ['FUEL_DATA_PATH'], 'PennTreebankCorpus/char_level_penntree.npz'))#pentree_char_and_word.npz') #data = np.load('char_level_penntree.npz') #trainset = data['train'] #validset = data['valid'] #allletters = " etanoisrhludcmfpkgybw<>\nvN.'xj$-qz&0193#285\\764/*" #dictionary = dict(zip(list(set(allletters)), range(50))) #invdict = {v: k for k, v in dictionary.items()} #numtrain = len(trainset) / seq_len * seq_len #numvalid = len(validset) / seq_len * seq_len #trainset = trainset[:numtrain] #validset = validset[:numvalid] ##if testing: ## train_features_numpy = train_features_numpy[:32 * 5] ## valid_features_numpy = valid_features_numpy[:100] #train_targets = trainset.reshape(-1, seq_len*framelen)[:,1:] #valid_targets = validset.reshape(-1, seq_len*framelen)[:,1:] ## still only 2d (b, t*n) #train_features_numpy = onehot(trainset).reshape(-1, 50*seq_len*framelen)[:,:-50] #valid_features_numpy = onehot(validset).reshape(-1, 50*seq_len*framelen)[:,:-50] #del trainset, validset #data_loaded = True #print '... done' #test_value = train_features_numpy[:32] #################### ########################################### # # MAKE STREAMS # ########################################### rng = np.random.RandomState(seed) stream_args = dict(rng=rng, pool_size=pool_size, maximum_frames=maximum_frames, pretrain_alignment=pretrain_alignment, uniform_alignment=uniform_alignment, window_features=window_features) if share_mask: drop_prob_cells = drop_prob # we don't want to actually use these masks, so this is to debug drop_prob_states = None # the threes in here are because the number of layers is hardcoded to 3 atm. 
NIPS! print '.. initializing iterators' # train_stream, valid_stream = get_seq_mnist_streams( # h_dim, batch_size, update_prob) if static_mask: train_stream = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_static_mask_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) else: train_stream = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) #train_dataset = Timit('train', features=features) # assert (train_features_numpy[:,-50:].sum(axis=-2)==1).all() #train_features_numpy = train_features_numpy.reshape(-1, seq_len-1, 50)#BTN for shuffled dataset? 
#train_dataset = IndexableDataset(indexables=OrderedDict( #[('features', train_features_numpy), #('outputs', train_targets)])) #train_stream = construct_stream_np(train_dataset, state_dim, batch_size, len(train_targets), #drop_prob_states, drop_prob_cells, drop_prob_igates, #num_layers=num_layers, #is_for_test=False, stoch_depth=stoch_depth, share_mask=share_mask, #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args) ##dev_dataset = Timit('dev', features=features) #valid_features_numpy = valid_features_numpy.reshape(-1, seq_len-1, 50) #dev_dataset = IndexableDataset(indexables=OrderedDict( #[('features', valid_features_numpy), #('outputs', valid_targets)])) #dev_stream = construct_stream_np(dev_dataset, state_dim, batch_size, len(valid_targets), #drop_prob_states, drop_prob_cells, drop_prob_igates, #num_layers=num_layers, #is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask, #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args) ##test_dataset = Timit('test', features=features) ##test_stream = construct_stream(test_dataset, state_dim, drop_prob_states, drop_prob_cells, drop_prob_igates, 3, ## is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask, ## gaussian_drop=gaussian_drop, **stream_args) data = train_stream.get_epoch_iterator(as_dict=True).next() #import ipdb; ipdb.set_trace() #phone_dict = train_dataset.get_phoneme_dict() #phoneme_dict = {k: phone_to_phoneme_dict[v] # if v in phone_to_phoneme_dict else v # for k, v in phone_dict.iteritems()} #ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()} #eol_symbol = ind_to_phoneme['<STOP>'] #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('features', dtype=floatX) x, y = x[:-1], x[1:] #T.lmatrix('outputs')# phonemes') drops_states = T.tensor3('drops_states') drops_cells = T.tensor3('drops_cells') drops_igates = T.tensor3('drops_igates') x.tag.test_value = data['features'] #y.tag.test_value = data['outputs'] drops_states.tag.test_value = data['drops_states'] drops_cells.tag.test_value = data['drops_cells'] drops_igates.tag.test_value = data['drops_igates'] if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=.2) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hid = Linear(50, state_dim * 4, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropLSTM(dim=state_dim, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn', ogates_zoneout=ogates_zoneout) elif rnn_type.lower() == 'gru': in_to_hid = Linear(50, state_dim * 3, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropGRU(dim=state_dim, weights_init=weights_init, activation=Tanh(), name='rnn') elif rnn_type.lower() == 'srnn': #FIXME!!! 
make ReLU in_to_hid = Linear(50, state_dim, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropSimpleRecurrent(dim=state_dim, weights_init=weights_init, activation=Rectifier(), name='rnn') else: raise NotImplementedError #lstm2 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6) #lstm3 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6) #encoder = DropMultiLayerEncoder(weights_init=weights_init, #biases_init=Constant(.0), #networks=[lstm1, lstm2, bidir3], #dims=[input_dim * window_features, #state_dim, #state_dim, #state_dim, #label_dim + 1]) #encoder.initialize() #drops_states = [drops_forw_states, drops_back_states] #drops_cells = [drops_forw_cells, drops_back_cells] #drops_igates = [drops_forw_igates, drops_back_igates] hid_to_out = Linear(state_dim, 50, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) in_to_hid.initialize() recurrent_layer.initialize() hid_to_out.initialize() h = in_to_hid.apply(x) if rnn_type.lower() == 'lstm': yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)[0] else: yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates) y_hat_pre_softmax = hid_to_out.apply(yh) shape_ = y_hat_pre_softmax.shape # y_hat = Softmax().apply( # y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### # cost = CategoricalCrossEntropy().apply(y.flatten().astype('int64'), y_hat) def crossentropy_lastaxes(yhat, y): # for sequence of distributions/targets return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1) def softmax_lastaxis(x): # for sequence of distributions return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) yhat = softmax_lastaxis(y_hat_pre_softmax) cross_entropies = crossentropy_lastaxes(yhat, y) cross_entropy = cross_entropies.mean().copy(name="cross_entropy") cost = 
cross_entropy.copy(name="cost") batch_cost = cost.copy(name='batch_cost') nll_cost = cost.copy(name='nll_cost') bpc = (nll_cost / np.log(2.0)).copy(name='bpr') #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost') cost_monitor = aggregation.mean( batch_cost, batch_size).copy(name='sequence_cost_monitor') cost_per_character = aggregation.mean( batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost') cost_train = cost.copy(name='train_batch_cost') cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor') cg_train = ComputationGraph([cost_train, cost_train_monitor]) ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## norm_cost = 0. 
def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) ## debugging nans stuff #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore') #grf = theano.function([x, input_mask], gr) #grz = grf(x.tag.test_value, input_mask.tag.test_value) #params = cg_train.parameters #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))] #for mm in mynanz: print mm ##import ipdb; ipdb.set_trace() elif penalty == 'hids': assert 'rnn_apply_states' in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): if output.name == 'rnn_apply_states': norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) ## debugging nans stuff #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore') #grf = theano.function([x, input_mask], gr) #grz = grf(x.tag.test_value, input_mask.tag.test_value) #params = cg_train.parameters #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))] #for mm in mynanz: print mm ##import ipdb; ipdb.set_trace() norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? 
cg_train = ComputationGraph([cost_train, cost_train_monitor]) #, norm_cost]) ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') cost_train_monitor = cg_train.outputs[1].copy( 'train_batch_cost_monitor') # if 'l2regularization' in kwargs: # weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) # cost_train += kwargs['l2regularization'] * sum([ # (weight ** 2).sum() for weight in weights]) # cost_train.name = 'cost_train' # cg_train = ComputationGraph(cost_train) model = Model(cost_train) train_cost_per_character = aggregation.mean( cost_train_monitor, (seq_len - 1) * batch_size).copy(name='train_character_cost') algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) observed_vars = [ cost_train, cost_train_monitor, train_cost_per_character, aggregation.mean(algorithm.total_gradient_norm) ] # parameters = model.get_parameter_dict() # for name, param in parameters.iteritems(): # observed_vars.append(param.norm(2).copy(name=name + "_norm")) # observed_vars.append( # algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc], data_stream=dev_stream, prefix="dev") #train_ctc_monitor = CTCMonitoring( #x, input_mask, #drops_forw_states, drops_forw_cells, drops_forw_igates, #drops_back_states, drops_back_cells, drops_back_igates, #y_hat, eol_symbol, train_stream, #prefix='train', every_n_epochs=1, #before_training=True, #phoneme_dict=phoneme_dict, #black_list=black_list, train=True) 
#dev_ctc_monitor = CTCMonitoring( #x, input_mask, #drops_forw_states, drops_forw_cells, drops_forw_igates, #drops_back_states, drops_back_cells, drops_back_igates, #y_hat, eol_symbol, dev_stream, #prefix='dev', every_n_epochs=1, #phoneme_dict=phoneme_dict, #black_list=black_list) extensions = [] # /u/pezeshki/speech_project/five_layer_timit/trained_params_best.npz if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name #_evaluator = CTCEvaluator(eol_symbol, x, input_mask, y_hat, #phoneme_dict=phoneme_dict, #black_list=black_list) #logger.info("CTC monitoring on TEST data started") #value_dict = _evaluator.evaluate(test_stream, False) #print value_dict.items() #logger.info("CTC monitoring on TEST data finished") #logger.info("CTC monitoring on TRAIN data started") #value_dict = _evaluator.evaluate(train_stream, True) #print value_dict.items() #logger.info("CTC monitoring on TRAIN data finished") #logger.info("CTC monitoring on DEV data started") #value_dict = _evaluator.evaluate(dev_stream, False) #print value_dict.items() #logger.info("CTC monitoring on DEV data finished") extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) #train_ctc_monitor, #dev_ctc_monitor]) if test_cost: test_monitor = DataStreamMonitoring( variables=[cost_monitor, cost_per_character], data_stream=test_stream, prefix="test") extensions.append(test_monitor) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = 
logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) # if write_predictions: # with open('predicted.txt', 'w') as f_pred: # with open('targets.txt', 'w') as f_targets: # evaluator = CTCEvaluator( # eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list) # evaluator.evaluate(dev_stream, file_pred=f_pred, # file_targets=f_targets) # return main_loop.run() print "Execution time: %f" % (time.time() - t1)
# Training split: batches of 128 examples, images flattened by the
# Flatten transformer.
mnist_train = MNIST("train")
train_stream = Flatten(
    DataStream.default_stream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 128)),
)

# Test split: only used for monitoring, served in batches of 1024.
mnist_test = MNIST("test")
test_stream = Flatten(
    DataStream.default_stream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 1024)),
)

# Assemble the pieces of the main loop one at a time, then run it.
from blocks.model import Model

sgd_algorithm = GradientDescent(
    cost=cost,
    params=ComputationGraph(cost).parameters,
    step_rule=Scale(learning_rate=0.1))

# Reports `cost` and `error_rate` measured on the test stream, under the
# "test" prefix.
test_set_monitor = DataStreamMonitoring(
    variables=[cost, error_rate],
    data_stream=test_stream,
    prefix="test")

main_loop = MainLoop(
    model=Model(cost),
    data_stream=train_stream,
    algorithm=sgd_algorithm,
    extensions=[
        FinishAfter(after_n_epochs=5),
        test_set_monitor,
        Printing(),
    ])
main_loop.run()
top_mlp_dims = [numpy.prod(conv_sequence4.get_dim('output')) ] + [mlp_hiddens] + [output_size] top_mlp = MLP(mlp_activation, top_mlp_dims, weights_init=Uniform(width=0.2), biases_init=Constant(0.)) top_mlp.initialize() probs = top_mlp.apply(out) cost = CategoricalCrossEntropy(name='Cross1').apply(y.flatten(), probs).copy(name='cost1') error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) error_rate2 = error_rate.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, iteration_scheme=ShuffledScheme(
def train_ladder(cli_params, dataset=None, save_to='results/ova_all_full'):
    """Train a ladder network and collect its clean outputs on the validation split.

    Parameters
    ----------
    cli_params : dict
        Run parameters. ``cli_params['save_dir']`` is overwritten with the
        value returned by ``prepare_dir(save_to)``.
    dataset : mapping
        Must provide the keys ``'ovadataset'``, ``'train_indexes'`` and
        ``'val_indexes'``. The default of ``None`` is kept only for
        signature compatibility; it is rejected immediately.
    save_to : str
        Passed to ``prepare_dir`` to obtain the output directory; ``log.txt``
        is written inside the directory it returns.

    Returns
    -------
    tuple or None
        ``(outputs, inputs)`` where ``outputs`` is the row-wise stack of the
        first output of the compiled function over one epoch of the
        validation stream, and ``inputs`` maps ``'features_labeled'``,
        ``'targets_labeled'`` and ``'features_unlabeled'`` to the stacked
        batches that were fed in. ``None`` if the main loop reports
        ``epoch_interrupt_received``.

    Raises
    ------
    ValueError
        If ``dataset`` is ``None``.
    """
    # Fail before any directory, log handler or model is created; the
    # original only failed much later with an opaque TypeError on
    # ``dataset['ovadataset']``.
    if dataset is None:
        raise ValueError(
            "train_ladder requires a dataset providing 'ovadataset', "
            "'train_indexes' and 'val_indexes'")

    cli_params['save_dir'] = prepare_dir(save_to)
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    ovadataset = dataset['ovadataset']
    train_indexes = dataset['train_indexes']
    val_indexes = dataset['val_indexes']

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(ovadataset, train_indexes, p.batch_size,
                        scheme=ShuffledScheme),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean_mc] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, train_indexes, p.batch_size),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total,
                 ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm] +
                ladder.costs.denois.values(),
                prefix="train", after_epoch=True),

            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # An interrupted run returns None either way. Decide that before
    # touching the log: the final monitoring pass is only scheduled after
    # p.num_epochs epochs, so its column may be missing from an interrupted
    # run, and the extra validation pass below would be wasted work.
    if main_loop.log.status['epoch_interrupt_received']:
        return None

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_matrix_cost'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    ds = make_datastream(ovadataset, val_indexes, p.batch_size)
    # Activation of the last layer on the clean, labeled path.
    outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1]
    # NOTE(review): presumably swaps the batch-norm statistics in the graph
    # for the stored estimates - confirm against TestMonitoring.
    outputreplacer = TestMonitoring()
    _, _, outputs = outputreplacer._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    it = ds.get_epoch_iterator(as_dict=True)
    res = []
    inputs = {
        'features_labeled': [],
        'targets_labeled': [],
        'features_unlabeled': []
    }
    # Loop over one epoch
    for d in it:
        # Store all inputs
        for k, v in d.iteritems():
            inputs[k] += [v]
        # Store outputs; the batch dict is looked up by the string form of
        # each graph input to get the argument order f expects.
        res += [f(*[d[str(inp)] for inp in cg.inputs])]

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()}

    return res[0], inputs
def test_convolutional_layer():
    """Scratch check of gradients through a single convolutional layer.

    Builds a Convolutional -> Rectifier -> MaxPooling sequence followed by a
    one-layer MLP, defines a mean categorical cross-entropy cost, and takes
    the gradient of that cost with respect to the last WEIGHT variable, the
    first Convolutional OUTPUT variable and the last BIAS variable found in
    the graph. It then compiles a Theano function, evaluates it on random
    input, prints the mean of the second returned array and returns None.

    NOTE(review): the function stops twice in ``pdb.set_trace()`` and has a
    bare ``return`` in front of the final comparison, so everything after
    that ``return`` is unreachable. This looks like leftover debugging
    state - confirm before relying on it as an automated test.
    """
    batch_size=2
    x = T.tensor4(); y = T.ivector()
    V = 200
    layer_conv = Convolutional(filter_size=(5,5),num_filters=V,
                               name="toto",
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0))
    # try with no bias
    activation = Rectifier()
    pool = MaxPooling(pooling_size=(2,2))

    convnet = ConvolutionalSequence([layer_conv, activation, pool],
                                    num_channels=15,
                                    image_size=(10,10),
                                    name="conv_section")
    convnet.push_allocation_config()
    convnet.initialize()
    output=convnet.apply(x)
    # From here on batch_size is the symbolic first dimension of the
    # output, no longer the literal 2 assigned above.
    batch_size=output.shape[0]
    output_dim=np.prod(convnet.get_dim('output'))
    result_conv = output.reshape((batch_size, output_dim))
    mlp=MLP(activations=[Rectifier().apply], dims=[output_dim, 10],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0.0))
    mlp.initialize()
    output=mlp.apply(result_conv)
    cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output))
    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    # Keep only the last weight / bias variable returned by the filters.
    # NOTE(review): the 4-way reshape of W below suggests these are expected
    # to be the convolution's parameters - confirm the filter ordering.
    W = W[-1]; b = B[-1]

    print W.shape.eval()
    print b.shape.eval()
    # Interactive breakpoint: execution stops here.
    import pdb
    pdb.set_trace()
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    var_input=inputs_conv[0]
    var_output=outputs_conv[0]

    [d_W,d_S,d_b] = T.grad(cost, [W, var_output, b])

    # Second interactive breakpoint.
    import pdb
    pdb.set_trace()
    w_shape = W.shape.eval()
    # Flatten the weight gradient to one row per first-axis entry of W.
    d_W = d_W.reshape((w_shape[0], w_shape[1]*w_shape[2]*w_shape[3]))

    # The symbolic bias gradient is discarded and replaced by zeros.
    # 6*6 is presumably the convolution's output map size (10x10 image,
    # 5x5 filter) - TODO confirm.
    d_b = T.zeros((w_shape[0],6*6))
    #d_b = d_b.reshape((w_shape[0], 8*8))
    # d_p is built here but not used by the compiled function below.
    d_p = T.concatenate([d_W, d_b], axis=1)
    # Move axis 1 of the output gradient to the front, then collapse the
    # remaining axes into one.
    d_S = d_S.dimshuffle((1, 0, 2, 3)).reshape((w_shape[0], batch_size, 6*6)).reshape((w_shape[0], batch_size*6*6))
    #d_S = d_S.reshape((2,200, 64))

    #x_value=1e3*np.random.ranf((1,15,10,10))
    x_value = 1e3*np.random.ranf((2,15, 10, 10))
    f = theano.function([x,y], [var_input, d_S, d_W],
                        allow_input_downcast=True,
                        on_unused_input='ignore')
    # B is rebound here: it now holds the evaluated d_S, not the bias list.
    A, B, C= f(x_value, [5, 5])
    print np.mean(B)
    # Early exit: nothing below this line runs.
    return

    # Unreachable. Compares C (the evaluated d_W) against the product of
    # B and an expansion of A, printing the Frobenius norm of the
    # difference.
    E_A = expansion_op(A, (2, 15, 10, 10), (5,5))
    print E_A.shape
    E_A = E_A.reshape((2*36, C.shape[1]))
    print E_A.shape
    tmp = C - np.dot(B, E_A)
    print lin.norm(tmp, 'fro')
# Convert RGB to BGR texture_image_nn_input = texture_image_nn_input[::-1, :, :] - MEAN_VALUES texture_image_nn_input = texture_image_nn_input.astype('float32') # print texture_image_nn_input print texture_image_nn_input.shape f_features_gram = theano.function( inputs=[X], outputs=[gram_matrix(f) for f in texture_features(X)]) target_image_features = f_features_gram(texture_image_nn_input) # print target_image_features print[t.shape for t in target_image_features] from blocks.graph import ComputationGraph, apply_batch_normalization, get_batch_normalization_updates cg = ComputationGraph(generated_image_graph) cg_bn = apply_batch_normalization(cg) pop_updates = get_batch_normalization_updates(cg_bn) text_generated = texture_features(cg.outputs[0]) gram_generated = [gram_matrix(f) for f in text_generated] loss = 0 for i in range(len(target_image_features)): N = text_generated[i].shape[1] M = text_generated[i].shape[2] * text_generated[i].shape[3] loss += 1. / (4 * 16 * N**2 * M**2) * ( (gram_generated[i] - tensor.addbroadcast( theano.shared(target_image_features[i]), 0))**2).sum() alpha = 0.1
if not key.startswith('__') and isinstance(getattr(config, key), (int, str, list, tuple)): logger.info(' %20s %s' % (key, str(getattr(config, key)))) model = config.Model(config) model.initialize() stream = config.Stream(config) inputs = stream.inputs() req_vars = model.cost.inputs train_stream = stream.train(req_vars) valid_stream = stream.valid(req_vars) cost = model.cost(**inputs) cg = ComputationGraph(cost) monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables)) valid_monitored = monitored if hasattr(model, 'valid_cost'): valid_cost = model.valid_cost(**inputs) valid_cg = ComputationGraph(valid_cost) valid_monitored = set([valid_cost] + VariableFilter( roles=[roles.COST])(valid_cg.variables)) if hasattr(config, 'dropout') and config.dropout < 1.0: cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout) if hasattr(config, 'noise') and config.noise > 0.0: cg = apply_noise(cg, config.noise_inputs(cg), config.noise) cost = cg.outputs[0] cg = Model(cost)
def analyze(cli_params):
    """Run the clean, labeled path of a ladder model over one data split.

    The split is selected by ``p.data_type`` ('train', 'valid' or 'test').
    For 'valid' and 'test' a DummyLoop with a FinalTestMonitoring extension
    is run first, and the output graph is passed through
    ``TestMonitoring._get_bn_params`` before being compiled.

    Returns a pair ``(targets, outputs)``: the stacked 'targets_labeled'
    batches and the stacked first output of the compiled function, both
    gathered over one epoch of a sequential stream over the chosen split.
    """
    p, _ = load_and_log_params(cli_params)
    _, data, whiten, cnorm = setup_data(p, test_set=True)
    ladder = setup_model(p)

    # (dataset, indices, needs batch-norm estimation) for every split
    split_table = {
        'train': (data.train, data.train_ind, False),
        'valid': (data.valid, data.valid_ind, True),
        'test': (data.test, data.test_ind, True),
    }
    dset, indices, calc_batchnorm = split_table[p.data_type]

    if calc_batchnorm:
        logger.info('Calculating batch normalization for clean.labeled path')

        monitored = ([ladder.costs.class_clean, ladder.error.clean] +
                     ladder.costs.denois.values())
        # These need to match with the training
        bn_train_stream = make_datastream(
            data.train, data.train_ind,
            p.batch_size,
            n_labeled=p.labeled_samples,
            n_unlabeled=len(data.train_ind),
            cnorm=cnorm,
            whiten=whiten,
            scheme=ShuffledScheme)
        bn_valid_stream = make_datastream(
            data.valid, data.valid_ind,
            p.valid_batch_size,
            n_labeled=len(data.valid_ind),
            n_unlabeled=len(data.valid_ind),
            cnorm=cnorm,
            whiten=whiten,
            scheme=ShuffledScheme)
        final_monitor = FinalTestMonitoring(
            monitored,
            bn_train_stream,
            bn_valid_stream,
            prefix="valid_final",
            before_training=True)

        # Denoising costs of layers 0..3 are printed as one group.
        denoising_costs = [ladder.costs.denois.get(layer)
                           for layer in range(4)]
        printer = ShortPrinting(
            {
                "valid_final": OrderedDict([
                    ('VF_C_class', ladder.costs.class_clean),
                    ('VF_E', ladder.error.clean),
                    ('VF_C_de', denoising_costs),
                ]),
            },
            after_training=True,
            use_log=False)

        main_loop = DummyLoop(extensions=[final_monitor, printer])
        main_loop.run()

    # Make a datastream that has all the indices in the labeled pathway
    ds = make_datastream(
        dset, indices,
        batch_size=p.get('batch_size'),
        n_labeled=len(indices),
        n_unlabeled=len(indices),
        balanced_classes=False,
        whiten=whiten,
        cnorm=cnorm,
        scheme=SequentialScheme)

    # We want the values after the softmax, i.e. the last layer
    last_layer = len(ladder.layers) - 1
    outputs = ladder.act.clean.labeled.h[last_layer]

    # Replace the batch normalization parameters with the shared variables
    if calc_batchnorm:
        _, _, outputs = TestMonitoring()._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    res = []
    inputs = {'features_labeled': [],
              'targets_labeled': [],
              'features_unlabeled': []}

    # Loop over one epoch
    for batch in ds.get_epoch_iterator(as_dict=True):
        # Store all inputs
        for source, value in batch.items():
            inputs[source].append(value)
        # Store outputs; arguments are looked up by the string form of
        # each graph input
        arguments = [batch[str(inp)] for inp in cg.inputs]
        res.append(f(*arguments))

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = dict((source, numpy.vstack(batches))
                  for source, batches in inputs.items())

    return inputs['targets_labeled'], res[0]
# scale is applied before shift train_stream = ScaleAndShift(train_stream, scl, shft) test_stream = ScaleAndShift(test_stream, scl, shft) baseline_uniform_noise = 1./255. # appropriate for MNIST and CIFAR10 Fuel datasets, which are scaled [0,1] uniform_noise = baseline_uniform_noise/scl ## initialize the model dpm = model.DiffusionModel(spatial_width, n_colors, uniform_noise=uniform_noise, **model_args) dpm.initialize() ## set up optimization features = T.matrix('features', dtype=theano.config.floatX) cost = dpm.cost(features) blocks_model = blocks.model.Model(cost) cg_nodropout = ComputationGraph(cost) if args.dropout_rate > 0: # DEBUG this triggers an error on my machine # apply dropout to all the input variables inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables) # dropconnect # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables) cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate) else: cg = cg_nodropout step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10) algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(), step_compute]), parameters=cg.parameters, cost=cost) extension_list = [] extension_list.append(
def main(name, epochs, batch_size, learning_rate):
    """Train an attention read/write reconstruction model on binarized MNIST.

    Two MLPs map the flattened image to the five attention parameters of a
    reader and a writer window; a third MLP maps the read patch to the
    patch that is written back. The cost is the binary cross-entropy
    between the input and the sigmoid of the written canvas.

    ``name`` defaults to "att-rw" when None and is used for the console
    banner and for the ``<name>.pkl`` checkpoint. ``epochs`` bounds the
    training length, ``batch_size`` is used for both the train and the test
    stream, and ``learning_rate`` is handed to Adam.
    """
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------
    # Bricks

    img_height, img_width = 28, 28
    read_N = 12
    write_N = 14
    x_dim = img_height * img_width

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP",
               **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP",
               **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2],
              name="MLP",
              **inits)

    mlpr.allocate()
    mlpr.initialize()
    mlpw.allocate()
    mlpw.initialize()
    mlp.allocate()
    mlp.initialize()

    #------------------------------------------------------------------------
    # Graph: read a patch, transform it, write it back

    x = tensor.matrix('features')

    read_params = reader.nn2att(mlpr.apply(x))
    write_params = writer.nn2att(mlpw.apply(x))

    # The fifth value (gamma) of the reader is not used.
    r_cy, r_cx, r_delta, r_sigma, _ = read_params
    patch = reader.read(x, r_cy, r_cx, r_delta, r_sigma)

    hidden = mlp.apply(patch)

    w_cy, w_cx, w_delta, w_sigma, w_gamma = write_params
    canvas = writer.write(hidden, w_cy, w_cx, w_delta, w_sigma) / w_gamma
    x_recons = T.nnet.sigmoid(canvas)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    # Optimisation

    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    step_rule = CompositeRule([
        RemoveNotFinite(),
        Adam(learning_rate),
        StepClipping(3.),
    ])
    #step_rule = RMSProp(learning_rate)
    #step_rule = Momentum(learning_rate=learning_rate, momentum=0.95)
    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=step_rule,
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors + [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm),
    ]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------
    # Data and main loop

    def float_stream(dataset):
        # One sequential pass over `dataset` in floatX batches.
        scheme = SequentialScheme(dataset.num_examples, batch_size)
        return ForceFloatX(DataStream(dataset, iteration_scheme=scheme))

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=epochs),
        DataStreamMonitoring(
            monitors,
            float_stream(mnist_test),
            prefix="test"),
        TrainingDataMonitoring(
            train_monitors,
            prefix="train",
            after_every_epoch=True),
        SerializeMainLoop(name + ".pkl"),
        #Plot(name, channels=plot_channels),
        ProgressBar(),
        Printing(),
    ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=float_stream(mnist_train),
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the symbolic graph of a two-input recurrent model.

    The integer matrices 'durations' and 'syllables' are embedded by two
    lookup tables, merged into a ``hidden_size``-wide representation, fed
    through an LSTM and a linear layer, and scored against the integer
    matrix 'pitches' with a categorical cross-entropy.

    Sets ``self.Cost`` (mean cross-entropy), ``self.ComputationGraph``
    (graph of that cost) and ``self.Model`` (model of the softmax output),
    in addition to storing the five size arguments on the instance.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    def small_uniform():
        # A fresh initializer per brick, as in a hand-written constructor call.
        return initialization.Uniform(width=0.01)

    # Symbolic inputs and target
    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    # One embedding table per input stream
    lookup1 = LookupTable(
        dim=self.lookup1_dim,
        length=self.input1_size,
        name='lookup1',
        weights_init=small_uniform(),
        biases_init=Constant(0))
    lookup1.initialize()

    lookup2 = LookupTable(
        dim=self.lookup2_dim,
        length=self.input2_size,
        name='lookup2',
        weights_init=small_uniform(),
        biases_init=Constant(0))
    lookup2.initialize()

    # Combine both embeddings into one hidden_size-wide vector
    merge = Merge(
        ['lookup1', 'lookup2'],
        [self.lookup1_dim, self.lookup2_dim],
        self.hidden_size,
        weights_init=small_uniform(),
        biases_init=Constant(0))
    merge.initialize()

    # NOTE(review): Blocks documents LSTM.apply as taking inputs 4 * dim
    # wide and returning (states, cells); here the merge output is
    # hidden_size wide and the result is used as a single value - confirm.
    recurrent_block = LSTM(
        dim=self.hidden_size,
        activation=Tanh(),
        weights_init=small_uniform())
    #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
    recurrent_block.initialize()

    # Output projection; its width is input1_size
    linear = Linear(
        input_dim=self.hidden_size,
        output_dim=self.input1_size,
        weights_init=small_uniform(),
        biases_init=Constant(0))
    linear.initialize()

    softmax = NDimensionalSoftmax()

    # Wire the bricks together
    embedded1 = lookup1.apply(x1)
    embedded2 = lookup2.apply(x2)
    merged = merge.apply(embedded1, embedded2)
    hidden = recurrent_block.apply(merged)
    activations = linear.apply(hidden)

    y_hat = softmax.apply(activations, extra_ndim=1)
    # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

    self.Cost = softmax.categorical_cross_entropy(
        y, activations, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)

    self.Model = Model(y_hat)
def main(config, tr_stream, dev_stream, use_bokeh=False):
    """Build the encoder-decoder model and run its training main loop.

    Parameters
    ----------
    config : dict
        Experiment configuration. It is handed to `create_model` and
        `F1Validator`; the keys read directly here are 'dropout',
        'finish_after', 'saveto', 'save_freq', 'hook_samples',
        'src_vocab', 'trg_vocab', 'phones', 'sampling_freq',
        'src_vocab_size', 'f1_validation', 'normalized_f1',
        'f1_val_freq', 'reload', 'step_clipping' and 'step_rule'.
    tr_stream
        Training data stream; used by the sampler and the main loop.
    dev_stream
        Validation data stream; used by the F1 validator.
    use_bokeh : bool, optional
        Accepted for interface compatibility; not used in this function.
    """
    logger.info('Building RNN encoder-decoder')
    cost, samples, search_model = create_model(config)
    #cost, samples, search_model = create_multitask_model(config)

    logger.info("Building model")
    cg = ComputationGraph(cost)
    training_model = Model(cost)

    # Cost that is actually optimised.  `apply_dropout` returns a *new*
    # graph and leaves `cost` untouched, so the regularised cost has to
    # be taken from the new graph's outputs; otherwise dropout is never
    # applied during training.
    train_cost = cost

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])
        train_cost = cg.outputs[0]

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        # The dropout-free cost is the one that gets monitored.
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    src_vocab=config['src_vocab'],
                    trg_vocab=config['trg_vocab'],
                    phones_vocab=config['phones'],
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on f1
    if config['f1_validation'] is not None:
        logger.info("Building f1 validator")
        extensions.append(
            F1Validator(samples=samples,
                        config=config,
                        model=search_model,
                        data_stream=dev_stream,
                        normalize=config['normalized_f1'],
                        every_n_batches=config['f1_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE: config['step_rule'] is evaluated with eval(); this is only
    # safe when the configuration comes from a trusted source.
    step_rule_class = eval(config['step_rule'])
    algorithm = GradientDescent(cost=train_cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    step_rule_class(),
                                    RemoveNotFinite()
                                ]),
                                on_unused_sources='warn')

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def main(config):
    """Build an attention-based BiLSTM encoder / LSTM decoder and train it.

    Parameters
    ----------
    config : dict
        Keys read here: 'train_src', 'dev_src', 'test_src', 'train_tgt',
        'dev_tgt', 'embed_src', 'hidden_src', 'hidden_tgt', 'embed_tgt',
        'dropout', 'finish_after', 'saveto', 'save_freq', 'reload',
        'step_clipping' and 'step_rule'.  The whole dict is also passed
        to `get_train_stream`.

    Side effects: prints the generator outputs, logs parameter shapes,
    truncates ``config['saveto'] + '.txt'`` when not reloading, and runs
    the main loop.
    """
    vocab_src, _ = text_to_dict([config['train_src'], config['dev_src'],
                                 config['test_src']])
    vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
                                     config['dev_tgt']])

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')

    source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
                                      [1, 4, 8, 4, 8, 4, 8],]
    source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
                                           [1, 0, 1, 0, 1, 0, 1],]
    target_sentence.tag.test_value = [[0, 1, 1, 5],
                                      [2, 0, 1, 0],]
    target_sentence_mask.tag.test_value = [[0, 1, 1, 0],
                                           [1, 1, 1, 0],]

    logger.info('Building RNN encoder-decoder')

    ### Building Encoder
    embedder = LookupTable(
        length=len(vocab_src),
        dim=config['embed_src'],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='embedder')
    # Projects embeddings to the 4 * hidden width fed to the LSTM.
    transformer = Linear(
        config['embed_src'],
        config['hidden_src']*4,
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='transformer')

    # Bias vector of four hidden-sized chunks; only the third is 1.0.
    lstminit = np.asarray([0.0,]*config['hidden_src'] +
                          [0.0,]*config['hidden_src'] +
                          [1.0,]*config['hidden_src'] +
                          [0.0,]*config['hidden_src'])
    encoder = Bidirectional(
        LSTM(
            dim=config['hidden_src'],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(lstminit)),
        name='encoderBiLSTM'
    )
    encoder.prototype.weights_init = Orthogonal()

    ### Building Decoder
    lstminit = np.asarray([0.0,]*config['hidden_tgt'] +
                          [0.0,]*config['hidden_tgt'] +
                          [1.0,]*config['hidden_tgt'] +
                          [0.0,]*config['hidden_tgt'])
    transition = LSTM2GO(
        attended_dim=config['hidden_tgt'],
        dim=config['hidden_tgt'],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(lstminit),
        name='decoderLSTM')

    attention = SequenceContentAttention(
        state_names=transition.apply.states,  # default activation is Tanh
        state_dims=[config['hidden_tgt']],
        attended_dim=config['hidden_src']*2,
        match_dim=config['hidden_tgt'],
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      attention.take_glimpses.outputs[0]],
        readout_dim=len(vocab_tgt),
        emitter=SoftmaxEmitter(name='emitter'),
        feedback_brick=LookupFeedback(
            num_outputs=len(vocab_tgt),
            feedback_dim=config['embed_tgt'],
            name='feedback'),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=config['hidden_tgt'], name='softmax_bias').apply,
            Linear(input_dim=config['hidden_tgt'],
                   output_dim=config['embed_tgt'],
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=config['embed_tgt'], name='softmax1').apply]),
        merged_dim=config['hidden_tgt'])

    decoder = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator",
        fork=Fork(
            [name for name in transition.apply.sequences if name != 'mask'],
            prototype=Linear()),
        add_contexts=True)
    decoder.transition.weights_init = Orthogonal()

    #printchildren(encoder, 1)

    # Initialize model
    logger.info('Initializing model')
    embedder.initialize()
    transformer.initialize()
    encoder.initialize()
    decoder.initialize()

    # Apply model
    embedded = embedder.apply(source_sentence)
    transformed = transformer.apply(embedded)
    encoded = encoder.apply(transformed)[0]

    generated = decoder.generate(
        n_steps=2*source_sentence.shape[1],
        batch_size=source_sentence.shape[0],
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=tensor.ones(source_sentence.shape).T
    )
    # Single-argument print: same output as the Python 2 statement
    # `print 'Generated: ', generated`, and valid on Python 3 as well.
    print('Generated:  {}'.format(generated))  # generator_generate_outputs

    #samples = generated[1] # For GRU
    samples = generated[2]  # For LSTM
    samples.name = 'samples'

    #samples_cost = generated[4] # For GRU
    samples_cost = generated[5]  # For LSTM
    # Name the variable (previously the variable itself was overwritten
    # with the string).
    samples_cost.name = 'sampling_cost'

    cost = decoder.cost(
        mask=target_sentence_mask.T,
        outputs=target_sentence.T,
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=source_sentence_mask.T)
    cost.name = 'target_cost'

    cost.tag.aggregation_scheme = TakeLast(cost)
    model = Model(cost)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # NOTE(review): the graph returned by apply_dropout only supplies
    # `cg.parameters` below; the algorithm still optimises `cost`.
    # Confirm whether dropout is meant to affect training here.
    if config['dropout'] < 1.0:  # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    ########
    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    printchildren(embedder, 1)
    printchildren(transformer, 1)
    printchildren(encoder, 1)
    printchildren(decoder, 1)
    # Print parameter names
    # enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
    # enc_dec_param_dict = merge(Selector(decoder).get_parameters())
    # logger.info("Parameter names: ")
    # for name, value in enc_dec_param_dict.items():
    #     logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    # logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
    ##########

    # Training data
    train_stream = get_train_stream(config,
                                    [config['train_src'],],
                                    [config['train_tgt'],],
                                    vocab_src, vocab_tgt)
    dev_stream = get_dev_stream(
        [config['dev_src'],], [config['dev_tgt'],],
        vocab_src, vocab_tgt)
    test_stream = get_test_stream([config['test_src'],], vocab_src)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        ProgressBar(),
        TrainingDataMonitoring([cost], prefix="tra", after_batch=True),
        DataStreamMonitoring(variables=[cost],
                             data_stream=dev_stream,
                             prefix="dev",
                             after_batch=True),
        Sampler(
            model=Model(samples),
            data_stream=dev_stream,
            vocab=cabvo,
            saveto=config['saveto']+'dev',
            every_n_batches=config['save_freq']),
        Sampler(
            model=Model(samples),
            data_stream=test_stream,
            vocab=cabvo,
            saveto=config['saveto']+'test',
            after_n_batches=1,
            on_resumption=True,
            before_training=True),
        Plotter(saveto=config['saveto'], after_batch=True),
        Printing(after_batch=True),
        Checkpoint(
            path=config['saveto'],
            parameters=cg.parameters,
            save_main_loop=False,
            every_n_batches=config['save_freq'])]
    if BOKEH_AVAILABLE:
        # Register the plot; it used to be constructed and discarded.
        # NOTE(review): the monitors above log with prefixes "tra" and
        # "dev"; check that the 'target_cost' channel name matches what
        # ends up in the log.
        extensions.append(
            Plot('Training cost', channels=[['target_cost']],
                 after_batch=True))
    if config['reload']:
        extensions.append(Load(path=config['saveto'],
                               load_iteration_state=False,
                               load_log=False))
    else:
        # Fresh run: create/truncate the companion text file.
        with open(config['saveto']+'.txt', 'w') as f:
            pass

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE: config['step_rule'] is evaluated with eval(); this is only
    # safe when the configuration comes from a trusted source.
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=model,
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=extensions)
    main_loop.run()
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Train a small MLP on the 3-arm spiral dataset.

    Parameters
    ----------
    num_epochs : int, optional
        Number of training epochs before the loop finishes.
    batch_normalized : bool, optional
        When true, use `BatchNormalizedMLP` and train on the
        batch-normalized version of the graph.
    alpha : float, optional
        Weight given to a new minibatch statistic when updating the
        population statistics; the existing value keeps weight
        ``1 - alpha``.

    Returns
    -------
    The `MainLoop` instance after `run()` has returned.
    """
    # BatchNormalizedMLP takes one extra keyword that trades a bit of
    # memory for speed; the plain MLP takes none.
    mlp_class = BatchNormalizedMLP if batch_normalized else MLP
    extra_kwargs = {'conserve_memory': False} if batch_normalized else {}

    activations = [Logistic() for _ in range(3)] + [Softmax()]
    layer_dims = [2, 5, 5, 5, 3]
    mlp = mlp_class(activations, layer_dims,
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.),
                    **extra_kwargs)
    mlp.initialize()

    # 10000 spiral examples: the first 8000 are used for training,
    # the last 2000 (as one batch) for testing.
    dataset = Spiral(num_examples=10000, classes=3,
                     sources=['features', 'label'], noise=0.05)
    train_scheme = ShuffledScheme(examples=8000, batch_size=20)
    train_stream = DataStream(dataset, iteration_scheme=train_scheme)
    test_scheme = SequentialScheme(examples=list(range(8000, 10000)),
                                   batch_size=2000)
    test_stream = DataStream(dataset, iteration_scheme=test_scheme)

    # Cost graph; any BatchNormalization bricks in it run in inference
    # mode by default.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # the default name is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if not batch_normalized:
        cg = original_cg
        extra_updates = []
    else:
        cg = apply_batch_normalization(original_cg)
        # Moving-average updates for the population parameters.
        extra_updates = []
        for p, m in get_batch_normalization_updates(cg):
            extra_updates.append((p, m * alpha + p * (1 - alpha)))

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    # The monitors deliberately use the original cost and misclass
    # variables so that the inference-mode graph is what gets reported.
    monitored = [cost, misclass]
    extensions = [
        DataStreamMonitoring(monitored, train_stream, prefix='train'),
        DataStreamMonitoring(monitored, test_stream, prefix='test'),
        Printing(),
        FinishAfter(after_n_epochs=num_epochs),
    ]
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train_stream,
                         extensions=extensions)
    main_loop.run()
    return main_loop
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim):
    """Build a DRAW model and train it on binarized MNIST.

    Parameters
    ----------
    name : str or None
        Experiment name, used for the pickle file and the plot.  When
        None, a name is derived from the other hyper-parameters.
    epochs : int
        Number of epochs after which training finishes.
    batch_size : int
        Minibatch size for both the train and the test stream.
    learning_rate : float
        Learning rate passed to Adam.
    attention : bool
        Use the attentive reader/writer when true, the plain ones
        otherwise.
    n_iter : int
        Number of DRAW iterations, passed to `DrawModel`.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensions.
    """
    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (tag, n_iter, enc_dim,
                                                 dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28*28
    img_height, img_width = (28, 28)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention:
        read_N = 4
        write_N = 7
        read_dim = 2*read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        # The writer gets its own window size; it was previously built
        # with read_N, leaving write_N unused.
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # The MLPs produce 4 * dim outputs, the input width of the LSTMs.
    encoder_mlp = MLP([Tanh()], [(read_dim+dec_dim), 4*enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4*dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    # Reconstruction term plus the KL terms summed over their first
    # axis and averaged.
    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    # Per-iteration monitors are currently disabled:
    # for t in range(n_iter):
    #     kl_term_t = kl_terms[t,:].mean()
    #     kl_term_t.name = "kl_term_%d" % t
    #
    #     x_recons_t = T.nnet.sigmoid(c[t,:,:])
    #     recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
    #     recons_term_t = recons_term_t.mean()
    #     recons_term_t.name = "recons_term_%d" % t
    #
    #     monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(
                        mnist_test.num_examples, batch_size))),
                ## updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name+".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
def main(mode, save_path, num_batches, data_path=None):
    """Train the word reverser or decode interactively with a trained one.

    Parameters
    ----------
    mode : str
        "train" builds the data pipeline and cost graph and runs the
        training main loop.  "sample" and "beam_search" load parameters
        from `save_path` and then prompt for sentences in an endless
        loop.  Any other value only constructs the `WordReverser`.
    save_path : str
        Checkpoint destination in "train" mode; source of the parameter
        values in the other two modes.
    num_batches : int
        Number of batches after which training finishes ("train" only).
    data_path : str, optional
        Text file to train on.  When falsy, the `OneBillionWord`
        "training" set is used instead.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline: character-level examples, filtered,
        # paired with reversed targets, batched by 10, padded and
        # passed through `_transpose`.
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings: Gaussian weights and zero biases are
        # pushed down first, then the encoder and the generator
        # transition are switched to orthogonal weights.
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        # chars.shape[1] is used as the batch size and chars.shape[0]
        # (below) as the maximum length.
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging.  Both filters are
        # unpacked as 1-tuples, so each must match exactly one variable.
        generator = reverser.generator
        (energies, ) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Add the L2 norm of every parameter and of its gradient.
        for name, parameter in parameters.items():
            observables.append(named_copy(
                parameter.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[parameter].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and
            beam search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                # Transpose so that iteration is per sequence.
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    # Cut each sequence after its first end-of-sequence
                    # code; keep it whole when there is none.
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        # Interactive loop; there is no exit condition in the code.
        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            # Map characters to codes (unknown ones to <UNK>) and wrap
            # the sentence in <S> ... </S>.
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            # Repeat the input as `batch_size` identical columns.
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Highest cost first, lowest cost printed last.
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(config, test_stream, testing_model):
    """Build the character-level encoder-decoder and run its test hooks.

    No training takes place: a `MainLoop` is created without algorithm
    or data stream and only its 'before_training' callbacks are run.

    Parameters
    ----------
    config : dict
        Model and evaluation configuration (vocabulary sizes, layer
        sizes and depths, 'reload', 'bleu_script', 'normalized_bleu').
    test_stream
        Test data stream; also provides `trg_bos` and `space_idx`.
    testing_model
        Passed to `LoadNMT` and `BleuTester`.
    """
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')

    bos_index = test_stream.trg_bos
    space_index = test_stream.space_idx['target']

    # The four source-side inputs are needed by both the encoder and
    # the BLEU tester, in this order.
    source_inputs = (source_char_seq, source_sample_matrix,
                     source_char_aux, source_word_mask)

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'],
        config['enc_embed'],
        config['src_dgru_nhids'],
        config['enc_nhids'],
        config['src_dgru_depth'],
        config['bidir_encoder_depth'])
    decoder = Decoder(
        config['trg_vocab_size'],
        config['dec_embed'],
        config['trg_dgru_nhids'],
        config['trg_igru_nhids'],
        config['dec_nhids'],
        config['enc_nhids'] * 2,
        config['transition_depth'],
        config['trg_igru_depth'],
        config['trg_dgru_depth'],
        space_index,
        bos_index)

    representation = encoder.apply(*source_inputs)
    cost = decoder.cost(
        representation, source_word_mask, target_char_seq,
        target_sample_matrix, target_resample_matrix, target_char_aux,
        target_char_mask, target_word_mask, target_prev_char_seq,
        target_prev_char_aux)

    logger.info("Building model")
    training_model = Model(cost)

    logger.info("Initializing extensions")
    extensions = []

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(testing_model))

    # Beam search / sampling graph is only built when BLEU is evaluated.
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        # generated[config['transition_depth']] is next_outputs
        next_outputs = generated[config['transition_depth']]
        outputs_filter = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")
        _, samples = outputs_filter(ComputationGraph(next_outputs))

        logger.info("Building bleu tester")
        bleu_tester = BleuTester(
            *source_inputs,
            samples=samples,
            config=config,
            model=search_model,
            data_stream=test_stream,
            testing_model=testing_model,
            normalize=config['normalized_bleu'])
        extensions.append(bleu_tester)

    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=None,
                         data_stream=None,
                         extensions=extensions)
    # Wire the extensions up by hand and fire only the pre-training hooks.
    for ext in main_loop.extensions:
        ext.main_loop = main_loop
    main_loop._run_extensions('before_training')
# Sample an F0 track from the best saved model and plot it.

# Normalisation statistics used to undo the scaling of the F0 channel.
f0_mean = data_stats['f0_mean']
f0_std = data_stats['f0_std']

save_dir = os.environ['RESULTS_DIR']
save_dir = os.path.join(save_dir, 'blizzard/')

experiment_name = "f0_only_1"

main_loop = load(save_dir + "pkl/best_" + experiment_name + ".pkl")

generator = main_loop.model.get_top_bricks()[0]

steps = 2048
n_samples = 1

sample = ComputationGraph(
    generator.generate(n_steps=steps, batch_size=n_samples, iterate=True))
sample_fn = sample.get_theano_function()

# The second-to-last output of the sampling function is used.
outputs = sample_fn()[-2]

# Channel 0 is rescaled with the F0 statistics; channel 1 is used as a
# multiplicative mask on it.
voiced = outputs[:, :, 1]
outputs = outputs[:, :, 0]
outputs = outputs * f0_std + f0_mean
outputs = outputs * voiced

# Swap the first two axes and keep the first (only) sample.
outputs = outputs.swapaxes(0, 1)
outputs = outputs[0]

pyplot.figure(figsize=(100, 15))
pyplot.plot(outputs, linewidth=3)
# Tie the x-axis to the number of generated steps instead of repeating
# the literal 2048.
pyplot.gca().set_xlim(0, steps)
pyplot.savefig(save_dir + "samples/best_" + experiment_name + "3.png")
def __init__(self, costs, tparams, step_rule, drop_input=None,
             learning_rate=None, clip_c=0., step_rule_kwargs=None,
             **kwargs):
    """Build one optimizer per computation graph.

    costs : dict, mapping cg_name to cost
    tparams : dict, mapping cg_name to shared parameters
    step_rule : str, optimizer (name of a callable, resolved with eval)
    drop_input : dict, mapping cg_name to drop_input ratio (float);
        defaults to 0.0 for every computation graph
    learning_rate : theano tensor variable; stored on the instance, the
        optimizers themselves are built on a fresh scalar named 'lr'
    clip_c : float, gradient clipping threshold (no clipping when <= 0)
    step_rule_kwargs : dict, additional arguments to the step rule;
        None is treated as "no extra arguments"
    kwargs : accepted and ignored
    """
    self.costs = costs
    self.tparams = tparams
    self.step_rule = step_rule
    self.learning_rate = learning_rate
    self.clip_c = clip_c
    self.step_rule_kwargs = step_rule_kwargs
    self.num_cgs = len(costs)
    self.cg_names = costs.keys()
    if any(is_multiSource(cg) for cg in self.cg_names):
        self.enc_ids, self.dec_ids = get_enc_dec_ids_mSrc(self.cg_names)
    else:
        self.enc_ids, self.dec_ids = get_enc_dec_ids(self.cg_names)
    self.f_grads = OrderedDict()
    self.f_grad_shareds = OrderedDict()
    self.f_updates = OrderedDict()
    self.drop_input = drop_input
    self._cost = None
    self.algorithms = OrderedDict()  # blocks legacy

    if drop_input is None:
        self.drop_input = {name: 0.0 for name in costs.keys()}

    # `**None` raises TypeError, so the default has to become an empty
    # mapping before it is expanded below.
    rule_kwargs = {} if step_rule_kwargs is None else step_rule_kwargs

    for cg_name in self.cg_names:
        cost = self.costs[cg_name]
        inps = ComputationGraph(cost).inputs
        params = make_ordered_dict(self.tparams[cg_name])

        logger.info(
            "Initializing the training algorithm [{}]".format(cg_name))

        logger.info("...computing gradient")
        grads = theano.tensor.grad(
            cost=cost, wrt=self.tparams[cg_name])

        if self.clip_c > 0.:
            logger.info("...clipping gradients")

            # Squared global norm over all gradients.
            g2 = 0.
            for g in grads:
                g2 += (g**2).sum()

            notfinite = tensor.isnan(g2) + tensor.isinf(g2)

            new_grads = []
            for g in grads:
                p = self._get_p_from_g(cg_name, g, params)
                # Rescale to norm clip_c when the global norm exceeds it.
                tmpg = tensor.switch(
                    g2 > (self.clip_c**2),
                    g / tensor.sqrt(g2) * self.clip_c, g)
                # On a NaN/Inf norm, use 0.1 * parameter as the gradient.
                new_grads.append(
                    tensor.switch(notfinite, numpy.float32(.1) * p, tmpg))
            grads = new_grads

        start_time = time.time()
        logger.info("...building optimizer")
        lr = tensor.scalar(name='lr')
        # NOTE: the step rule name is resolved with eval(); only pass
        # trusted strings as `step_rule`.
        optimizer = eval(self.step_rule)
        self.f_grad_shareds[cg_name], self.f_updates[cg_name], \
            step_rule_updates = optimizer(
                lr, params, grads, inps, cost, **rule_kwargs)
        logger.info(" took: {} seconds".format(time.time() - start_time))

        # blocks legacy, just a helper
        self.algorithms[cg_name] = Algorithm(cost, inps, params, grads,
                                             step_rule_updates)
def main_rnn(config):
    """Train a 4-layer LSTM sequence generator on the MFCC stream.

    Parameters
    ----------
    config : dict
        Keys read here: 'target_size', 'lstm_hidden_size',
        'learning_rate', 'batch_size', 'source_size', 'num_examples'
        and 'num_batches'.
    """
    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

    # Earlier experiments (an LSTM stack from `models`, a TrivialEmitter,
    # a SquaredError-based emitter cost) were replaced by this emitter.
    emitter = TestEmitter()
    print(type(emitter.cost))

    steps = 2
    n_samples = config['target_size']
    hidden_size = config['lstm_hidden_size']

    lstm_layers = [LSTM(hidden_size) for _ in range(4)]
    transition = RecurrentStack(lstm_layers, name="transition",
                                skip_connections=False)

    # Only the hidden states (not the cells) are fed to the readout.
    source_names = []
    for state_name in transition.apply.states:
        if 'states' in state_name:
            source_names.append(state_name)

    readout = Readout(emitter,
                      readout_dim=hidden_size,
                      source_names=source_names,
                      feedback_brick=None,
                      merge=None,
                      merge_prototype=None,
                      post_merge=None,
                      merged_dim=None)

    seqgen = SequenceGenerator(readout, transition,
                               attention=None, add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()
    # The transition gets Gaussian biases instead of zeros.
    seqgen.transition.biases_init = IsotropicGaussian(0.01,1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states',states)
    # One zero-filled shared variable per transition output, handed to
    # cost_matrix as keyword arguments.
    initial_values = {}
    for state_name in states:
        initial_values[state_name] = shared_floatx_zeros(
            (n_samples, hidden_size))
    states = initial_values

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)

    # For sampling one would instead build:
    # cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True))

    step_rule = Scale(learning_rate=config['learning_rate'])
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    # Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],
                                   config['source_size'],
                                   config['target_size'],
                                   config['num_examples'])

    # Monitoring; test-set monitoring and checkpointing are disabled.
    extensions = [
        Timing(),
        FinishAfter(after_n_batches=config['num_batches']),
        TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
        ProgressBar(),
        Printing(every_n_batches=1),
    ]

    # `model` is deliberately not passed to the main loop.
    main_loop = MainLoop(algorithm, train_stream, extensions=extensions)
    main_loop.run()
def __init__(self, config, vocab_size):
    """Build the symbolic graph of a stacked bidirectional LSTM classifier.

    The question tokens are embedded, passed through one forward and one
    backward LSTM per entry of ``config.lstm_size``, and the last time step
    of the selected hidden sequences is fed to an output MLP with
    ``config.n_entities`` outputs. Outputs of entities that are not listed
    among the (masked) candidates are replaced by -1000 before the
    prediction, cross-entropy cost and error rate are derived.

    Parameters
    ----------
    config
        Configuration object; the attributes read here are ``embed_size``,
        ``lstm_size``, ``skip_connections``, ``out_mlp_hidden``,
        ``out_mlp_activations``, ``n_entities``, ``w_noise``, ``dropout``,
        ``weights_init`` and ``biases_init``.
    vocab_size
        Number of rows of the question embedding lookup table.

    Sets ``self.sgd_cost``, ``self.monitor_vars`` (regularised graph) and
    ``self.monitor_vars_valid`` (unregularised graph), then initialises all
    bricks that were created.
    """
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    bricks = []

    # The recurrent bricks expect time on the first axis.
    question_t = question.dimshuffle(1, 0)
    mask_t = question_mask.dimshuffle(1, 0)
    float_mask = mask_t.astype(theano.config.floatX)

    # Question embedding.
    embed = LookupTable(vocab_size, config.embed_size,
                        name='question_embed')
    bricks.append(embed)
    embedded = embed.apply(question_t)

    # Stack of bidirectional LSTM layers. ``layer_inputs`` and
    # ``layer_input_dims`` describe what feeds the layer being built.
    layer_inputs = [embedded]
    layer_input_dims = [config.embed_size]
    hidden_list = []

    for layer, size in enumerate(config.lstm_size):
        # One input projection per source feeding this layer; an LSTM brick
        # takes the four gate pre-activations concatenated, hence 4 * size.
        fwd_projs = [
            Linear(input_dim=in_dim, output_dim=4 * size,
                   name='fwd_lstm_in_%d_%d' % (layer, idx))
            for idx, in_dim in enumerate(layer_input_dims)
        ]
        fwd_lstm = LSTM(dim=size, activation=Tanh(),
                        name='fwd_lstm_%d' % layer)

        bwd_projs = [
            Linear(input_dim=in_dim, output_dim=4 * size,
                   name='bwd_lstm_in_%d_%d' % (layer, idx))
            for idx, in_dim in enumerate(layer_input_dims)
        ]
        bwd_lstm = LSTM(dim=size, activation=Tanh(),
                        name='bwd_lstm_%d' % layer)

        # Registration order is kept: LSTMs first, then the projections.
        bricks.extend([fwd_lstm, bwd_lstm])
        bricks.extend(fwd_projs)
        bricks.extend(bwd_projs)

        fwd_in = sum(proj.apply(src)
                     for proj, src in zip(fwd_projs, layer_inputs))
        bwd_in = sum(proj.apply(src)
                     for proj, src in zip(bwd_projs, layer_inputs))

        # The backward direction runs on the time-reversed input and mask.
        fwd_hidden, _ = fwd_lstm.apply(fwd_in, mask=float_mask)
        bwd_hidden, _ = bwd_lstm.apply(bwd_in[::-1], mask=float_mask[::-1])
        hidden_list.extend([fwd_hidden, bwd_hidden])

        # Flip the backward states back so both directions share one time
        # axis before they feed the next layer.
        bwd_aligned = bwd_hidden[::-1]
        if config.skip_connections:
            layer_inputs = [embedded, fwd_hidden, bwd_aligned]
            layer_input_dims = [config.embed_size, size, size]
        else:
            layer_inputs = [fwd_hidden, bwd_aligned]
            layer_input_dims = [size, size]

    # Output MLP: with skip connections every layer contributes its last
    # step, otherwise only the two directions of the top layer do.
    if config.skip_connections:
        mlp_in_dim = 2 * sum(config.lstm_size)
        mlp_sources = hidden_list
    else:
        mlp_in_dim = 2 * config.lstm_size[-1]
        mlp_sources = hidden_list[-2:]

    out_mlp = MLP(
        dims=[mlp_in_dim] + config.out_mlp_hidden + [config.n_entities],
        activations=config.out_mlp_activations + [Identity()],
        name='out_mlp')
    bricks.append(out_mlp)
    last_steps = [h[-1, :, :] for h in mlp_sources]
    scores = out_mlp.apply(tensor.concatenate(last_steps, axis=1))

    # Count, per example and entity id, how often the id occurs among the
    # unmasked candidates; masked-out slots are mapped to -1 first.
    entity_ids = tensor.arange(config.n_entities,
                               dtype='int32')[None, None, :]
    visible_candidates = tensor.switch(
        candidates_mask, candidates,
        -tensor.ones_like(candidates))[:, :, None]
    is_candidate = tensor.eq(entity_ids, visible_candidates).sum(axis=1)
    # Entities with a zero count get a constant score of -1000.
    scores = tensor.switch(is_candidate, scores,
                           -1000 * tensor.ones_like(scores))

    # Prediction, cost and error rate.
    pred = scores.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, scores).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Regularised copy of the graph: weight noise, then dropout on the
    # LSTM hidden states, each only when enabled in the config.
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weights, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, hidden_list, config.dropout)
    cost_reg, error_rate_reg = cg.outputs

    # Regularised and clean variables share the same monitoring names.
    cost_reg.name = 'cost'
    cost.name = 'cost'
    error_rate_reg.name = 'error_rate'
    error_rate.name = 'error_rate'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialise every brick with the schemes from the config.
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()