def visualize_singular_values(args):
    param_values = load_parameter_values(args.load_path)
    for d in range(args.layers):
        if args.rnn_type == 'lstm':
            ws = param_values["/recurrentstack/lstm_" + str(d) + ".W_state"]
            w_rec = ws[:, 3 * args.state_dim:]
        elif args.rnn_type == 'simple':
            w_rec = param_values["/recurrentstack/simplerecurrent_" +
                                 str(d) + ".W_state"]
        else:
            raise NotImplementedError
        U, s, V = np.linalg.svd(w_rec, full_matrices=True)
        plt.subplot(2, 1, 1)
        plt.plot(np.arange(s.shape[0]), s, label='Layer_' + str(d))
        plt.grid(True)
        plt.legend(loc='upper right')
        plt.title("Singular_values_of_recurrent_weights")
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(s.shape[0]), np.log(s + 1E-15),
                 label='Layer_' + str(d))
        plt.grid(True)
        plt.title("Log_singular_values_of_recurrent_weights")
    plt.tight_layout()
    plt.savefig(args.save_path + "/visualize_singular_values.png")
    logger.info("Figure \"visualize_singular_values"
                ".png\" saved at directory: " + args.save_path)
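# A minimal, self-contained sketch of the same singular-value plot, using a
# random matrix as a stand-in for a trained recurrent weight matrix (the
# matrix size and output file name below are made up for illustration):
import numpy as np
import matplotlib.pyplot as plt

w_rec = np.random.randn(256, 256) / np.sqrt(256)
# Singular values are returned in decreasing order.
s = np.linalg.svd(w_rec, compute_uv=False)

plt.subplot(2, 1, 1)
plt.plot(np.arange(s.shape[0]), s)
plt.title("Singular values")
plt.subplot(2, 1, 2)
plt.plot(np.arange(s.shape[0]), np.log(s + 1e-15))
plt.title("Log singular values")
plt.tight_layout()
plt.savefig("singular_values.png")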
def load_params(self, path):
    graphs = [
        self.get_cost_graph().outputs[0],
        ComputationGraph(self.get_generate_graph()['outputs'])
    ]
    param_values = load_parameter_values(path)
    for graph in graphs:
        SpeechModel(graph).set_parameter_values(param_values)
def load_to(self, main_loop):
    main_loop.model.set_parameter_values(load_parameter_values(self.path))
    if self.load_iteration_state or self.load_log:
        with open(self.path, "rb") as source:
            loaded_main_loop = load(source)
        if self.load_log:
            main_loop.log = loaded_main_loop.log
        if self.load_iteration_state:
            main_loop.iteration_state = loaded_main_loop.iteration_state
def test_serialization():
    # Create a simple brick with two parameters
    mlp = MLP(activations=[None, None], dims=[10, 10, 10],
              weights_init=Constant(1.), use_bias=False)
    mlp.initialize()
    W = mlp.linear_transformations[1].W
    W.set_value(W.get_value() * 2)

    # Check the data using numpy.load
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear_0.W', 'mlp-linear_1.W', 'pkl'])
    assert_allclose(numpy_data['mlp-linear_0.W'], numpy.ones((10, 10)))
    assert numpy_data['mlp-linear_0.W'].dtype == theano.config.floatX

    # Ensure that it can be unpickled
    mlp = load(f.name)
    assert_allclose(mlp.linear_transformations[1].W.get_value(),
                    numpy.ones((10, 10)) * 2)

    # Ensure that only parameters are saved as NPY files
    mlp.random_data = numpy.random.rand(10)
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear_0.W', 'mlp-linear_1.W', 'pkl'])

    # Ensure that parameters can be loaded with correct names
    parameter_values = load_parameter_values(f.name)
    assert set(parameter_values.keys()) == \
        set(['/mlp/linear_0.W', '/mlp/linear_1.W'])

    # Ensure that duplicate names are dealt with
    for child in mlp.children:
        child.name = 'linear'
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear.W', 'mlp-linear.W_2', 'pkl'])

    # Ensure warnings are raised when __main__ namespace objects are dumped
    foo.__module__ = '__main__'
    import __main__
    __main__.__dict__['foo'] = foo
    mlp.foo = foo
    with NamedTemporaryFile(delete=False) as f:
        with warnings.catch_warnings(record=True) as w:
            dump(mlp, f)
            assert len(w) == 1
            assert '__main__' in str(w[-1].message)
def run_visualizations(cost, updates, train_stream, valid_stream, args,
                       hidden_states=None, gate_values=None):
    # Load the parameters from a dumped model
    assert args.load_path is not None
    model = Model(cost)
    model.set_parameter_values(load_parameter_values(args.load_path))

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost, hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        else:
            assert False
    elif args.visualize == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream, args)
    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream, args)
    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "presoft":
        visualize_presoft(cost, hidden_states, updates,
                          train_stream, valid_stream, args)
    elif args.visualize == "matrices":
        visualize_matrices(args)
    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)
    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates, args)
    else:
        assert False
def fine_tuning(cost, args):
    param_values = load_parameter_values(args.fine_tuning)
    param_values["/output_layer.W"] = np.concatenate((
        param_values["/output_layer.W"],
        0.1 * np.random.randn(args.state_dim, 40).astype(np.float32)))
    model = Model(cost)
    model.set_parameter_values(param_values)
    return cost
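# For reference, the concatenation above grows the stored output weight
# matrix along axis 0 (the default for np.concatenate), keeping the
# pretrained rows and appending freshly initialized ones. A shape-only
# sketch with made-up dimensions:
import numpy as np

state_dim = 100
pretrained_W = np.zeros((200, 40), dtype=np.float32)
new_rows = 0.1 * np.random.randn(state_dim, 40).astype(np.float32)
extended_W = np.concatenate((pretrained_W, new_rows))
print(extended_W.shape)  # (300, 40)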
def load_to(self, main_loop):
    aux_param_step = load_parameter_values(self.path)
    # Ugly hacking, but I'm super lazy, and this only needs to work for a
    # few days with a limited set of models.
    # I changed stuff in set_parameter_values; remember, it is ugly.
    main_loop.model.set_parameter_values(aux_param_step)
    if self.load_iteration_state or self.load_log:
        with open(self.path, "rb") as source:
            loaded_main_loop = load(source)
        if self.load_log:
            main_loop.log = loaded_main_loop.log
        if self.load_iteration_state:
            main_loop.iteration_state = loaded_main_loop.iteration_state
def visualize_eigenvalues(args):
    path = args.load_path
    layers = args.layers
    param_values = load_parameter_values(path)
    print(param_values.keys())
    matrices = {}
    for i in range(layers):
        matrices[i] = param_values["/recurrentstack/simplerecurrent_" +
                                   str(i) + ".W"]
        u, diag, v = svd(matrices[i])
        print(diag)
def visualize_matrices(args):
    param_values = load_parameter_values(args.load_path)
    print(param_values.keys())
    input0 = param_values["/fork/fork_inputs/lookuptable.W_lookup"]
    input1 = param_values["/fork/fork_inputs_1/lookuptable.W_lookup"]
    input2 = param_values["/fork/fork_inputs_2/lookuptable.W_lookup"]
    input3 = param_values["/fork/fork_inputs_3/lookuptable.W_lookup"]
    inputall = np.abs(np.concatenate((input0, input1, input2, input3),
                                     axis=1))
    outputall = np.abs(param_values["/output_layer.W"])
    # plt.matshow(inputall / np.mean(inputall), cmap=plt.cm.gray)
    # plt.matshow(outputall / np.mean(outputall), cmap=plt.cm.gray)
    plt.plot(np.arange(outputall.shape[0]), np.sum(np.abs(outputall), axis=1))
    if args.local:
        plt.show()
    else:
        plt.savefig(args.save_path + "/visualize_matrices.png")
        logger.info("Figure \"visualize_matrices"
                    ".png\" saved at directory: " + args.save_path)
def initialize_graph(recognizer, data, config, params):
    # Separate attention_params to be handled differently
    # when regularization is applied
    attentions = recognizer.all_children().generator.transition.attention.get()
    attention_params = [Selector(attention).get_parameters().values()
                        for attention in attentions]

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    observables = []  # monitored each batch

    cg = recognizer.get_cost_graph(batch=True)
    labels = []
    labels_mask = []
    for chld in recognizer.children:
        lbls = VariableFilter(applications=[chld.cost],
                              name='labels' + chld.names_postfix)(cg)
        lbls_mask = VariableFilter(applications=[chld.cost],
                                   name='labels_mask' + chld.names_postfix)(cg)
        if len(lbls) == 1:
            labels += lbls
            labels_mask += lbls_mask

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(labels[0].shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls
        # its output "output_0".
        applications=recognizer.all_children().bottom.apply.get(),
        name_regex="output")(
            cost_cg)
    attended = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended")(
            cost_cg)
    attended_mask = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended_mask")(
            cost_cg)
    weights = VariableFilter(
        applications=recognizer.all_children().generator.evaluate.get(),
        name="weights")(
            cost_cg)

    def get_renamed_list(rlist, elem_func, elem_name):
        return [rename(elem_func(elem), elem_name + chld.names_postfix)
                for elem, chld in zip(rlist, recognizer.children)]

    max_sentence_lengths = get_renamed_list(bottom_output,
                                            lambda e: e.shape[0],
                                            "max_sentence_length")
    max_attended_mask_lengths = get_renamed_list(attended_mask,
                                                 lambda e: e.shape[0],
                                                 "max_attended_mask_length")
    max_attended_lengths = get_renamed_list(attended,
                                            lambda e: e.shape[0],
                                            "max_attended_length")
    max_num_characters = get_renamed_list(labels,
                                          lambda e: e.shape[0],
                                          "max_num_characters")
    mean_attended = get_renamed_list(attended,
                                     lambda e: abs(e).mean(),
                                     "mean_attended")
    mean_bottom_output = get_renamed_list(bottom_output,
                                          lambda e: abs(e).mean(),
                                          "mean_bottom_output")
    mask_density = get_renamed_list(labels_mask,
                                    lambda e: e.mean(),
                                    "mask_density")
    weights_entropy = [rename(entropy(w, lm),
                              "weights_entropy" + chld.names_postfix)
                       for w, lm, chld in zip(weights, labels_mask,
                                              recognizer.children)]

    observables += max_attended_lengths + max_attended_mask_lengths \
        + max_sentence_lengths

    #
    # Monitoring of cost terms is tricky because of Blocks #514: since the
    # costs are annotations that are not part of the original output graph,
    # they are unaffected by replacements such as dropout!
    #
    cost_terms = []
    for chld in recognizer.children:
        chld_cost_terms = VariableFilter(applications=[chld.generator.evaluate],
                                         name_regex='.*_nll')(cost_cg)
        chld_cost_terms = [rename(var, var.name[:-4] + chld.names_postfix + '_nll')
                           for var in chld_cost_terms]
        cost_terms += chld_cost_terms

    cg = ComputationGraph([cost, batch_size] + weights_entropy +
                          mean_attended + mean_bottom_output +
                          max_num_characters + mask_density + cost_terms)

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg

    if reg_config.get('dropout'):
        drop_conf = reg_config['dropout']
        bot_drop = drop_conf.get('bottom', 0.0)
        if bot_drop:
            logger.info('apply bottom dropout')
            regularized_cg = apply_dropout(regularized_cg,
                                           bottom_output, bot_drop)
        enc_drop = drop_conf.get('encoder', 0.0)
        if enc_drop:
            logger.info('apply encoder dropout')
            enc_bricks = reduce(lambda acc, x: acc + list(x),
                                recognizer.all_children().encoder.children.get(),
                                [])
            enc_states = VariableFilter(bricks=enc_bricks,
                                        name_regex='states')(regularized_cg)
            regularized_cg = apply_dropout(regularized_cg,
                                           enc_states, enc_drop)
        post_merge_drop = drop_conf.get('post_merge', 0.0)
        if post_merge_drop:
            logger.info('apply post_merge dropout')
            pm_bricks = []
            for chld in recognizer.children:
                cpm_bricks = list(chld.generator.readout.post_merge.children)
                cpm_bricks += cpm_bricks[-1].children
                cpm_bricks = [b for b in cpm_bricks
                              if isinstance(b, type(chld.post_merge_activation))]
                pm_bricks += cpm_bricks
            regularized_cg = apply_dropout(
                regularized_cg,
                VariableFilter(bricks=pm_bricks, name='output')(regularized_cg),
                post_merge_drop)

    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]

    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)

    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = train_cost.copy(name='train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(regularized_cg.outputs[0]
                                   ).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        observables += [
            regularized_cg.outputs[1],   # model cost
            regularized_cg.outputs[2],   # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    if len(cost_terms):
        # Please note: the aggregation (mean) is done in
        # "attach_aggregation_schemes"
        ct_names = [v.name for v in cost_terms]
        for v in regularized_cg.outputs:
            if v.name in ct_names:
                observables.append(rename(v.sum() / batch_size, v.name))

    for chld in recognizer.children:
        if chld.train_tags:
            tags_cost = VariableFilter(applications=[chld.addTagCost],
                                       name='output')(regularized_cg)[0]
            observables += [rename(tags_cost.sum() / batch_size,
                                   'tags_nll' + chld.names_postfix)]

    # Model is a weird class; we spent lots of time arguing with Bart about
    # what it should be. However, it can already do nice things, e.g. extract
    # all the parameters from the computation graphs and give them
    # hierarchical names. This helps to notice when, because of some bug,
    # a parameter is not in the computation graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params here,
        # as it builds a new computation graph that does not have the
        # shared variables added by adaptive weight noise.
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape)
                     for key in sorted(parameters.keys())],
                    width=120))

    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]

    return {
        'observables': observables,
        'max_norm_rules': max_norm_rules,
        'cg': cg,
        'regularized_cg': regularized_cg,
        'train_cost': train_cost,
        'cost': cost,
        'batch_size': batch_size,
        'batch_cost': batch_cost,
        'parameters': parameters,
        'gradients': gradients,
        'model': model,
        'data': data,
        'recognizer': recognizer,
        'weights_entropy': weights_entropy,
        'labels_mask': labels_mask,
        'labels': labels}
convnet.layers[0].weights_init = Uniform(width=0.2)
convnet.layers[1].weights_init = Uniform(width=0.2)
convnet.layers[2].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2)
convnet.initialize()
'''
#########################################################

# Generate output and error signal
predict = convnet.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(),
                                       predict).copy(name='cost')
cg = ComputationGraph(cost)

# Load the parameters of the model
params = load_parameter_values('catsVsDogs256.pkl')
mo = Model(predict)
mo.set_parameter_values(params)
print mo.inputs
print dir(mo.inputs)
print mo.outputs
f = theano.function(mo.inputs, mo.outputs, allow_input_downcast=True)

predictions = []
k = 0
for batch in stream_test.get_epoch_iterator():
    example = numpy.array([batch[0]])
    batch_predictions = f(example)
    # batch_predictions = batch_predictions[0]  # get the array
    for result in batch_predictions:
        res = numpy.argmax(result)
def load_params(self, path):
    generated = self.get_generate_graph()
    param_values = load_parameter_values(path)
    SpeechModel(generated['outputs']).set_parameter_values(param_values)
def main(argv):
    name = argv[1]
    files = map(lambda p: join(folder, p), listdir(folder))
    file = next(filter(lambda n: name in n, files))
    print(file)
    p = load_parameter_values(file)
    net = net_dvc((128, 128))
    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)
    g = Model(y_hat)
    for k, v in p.items():
        p[k] = v.astype('float32')
    g.set_parameter_values(p)

    a, t, v = get_dvc((128, 128), trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i, e in enumerate(l, start=1):
            f.write(str(i) + "," + str(e) + "\n")

    def kaggle(file, data):
        write_kaggle(file, map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)
        total = 0
        equal = 0
        for r, t in zip(res, true):
            total += 1
            equal += 1 if r == t else 0
        return equal / total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))

    kaggle_file = join(result_folder, name + ".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)
for p in cg.parameters:
    print(str(p), p.shape, p.dtype)
print("Created ComputationGraph, inputs:")
print(cg.inputs)

# Strangely, all the examples use DataStreamMonitoring in MainLoop
model = Model(labels)
print("Model.dict_of_inputs():")
print(model.dict_of_inputs())
print("Model list inputs:")
print([v.name for v in model.inputs])

## Model loading from saved file
model.set_parameter_values(load_parameter_values(save_state_path))

examine_embedding(lookup.W.get_value())

label_ner = model.get_theano_function()
print(model.inputs)
print("printed label_ner.params")

for test_data in data_stream.get_epoch_iterator():
    ordered_batch = test_data[0:3]  # Explicitly strip off the pre-defined labels
    # print(ordered_batch)
    results = label_ner(*ordered_batch)
    # print(results)  # This is a pure array of labels
    inputs = _transpose(ordered_batch)
# Get the test stream
from fuel.datasets.dogs_vs_cats import DogsVsCats
from fuel.streams import DataStream, ServerDataStream
from fuel.schemes import ShuffledScheme, SequentialExampleScheme
from fuel.transformers.image import RandomFixedSizeCrop, \
    MinimumImageDimensions, MaximumImageDimensions, Random2DRotation
from fuel.transformers import Flatten, Cast, ScaleAndShift

size = (128, 128)
cats = DogsVsCats(('test',))
stream = DataStream.default_stream(
    cats, iteration_scheme=SequentialExampleScheme(cats.num_examples))
stream_upscale = MaximumImageDimensions(stream, size,
                                        which_sources=('image_features',))
stream_scale = ScaleAndShift(stream_upscale, 1. / 255, 0,
                             which_sources=('image_features',))
stream_data = Cast(stream_scale, dtype='float32',
                   which_sources=('image_features',))

# Load the parameters of the model
params = load_parameter_values('convnet_parameters.pkl')
mo = Model(predict)
mo.set_parameter_values(params)

# Create the forward propagation function
fprop = function(mo.inputs, mo.outputs[0], allow_input_downcast=True)

tab = []
i = 1
# Get the prediction for each example of the test set
for data in stream_data.get_epoch_iterator():
    predict = np.argmax(fprop(data))
    tab.append([i, predict])
    print str(i) + "," + str(predict)
    i = i + 1

# Save predictions in a csv file
np.savetxt("dump.csv", tab, delimiter=",", fmt='%d')
out2 = inception((None, None), 192, 64, 96, 128, 16, 32, 32, out, 10)
out3 = inception((None, None), 256, 128, 128, 192, 32, 96, 64, out2, 20)
out31 = MaxPooling((2, 2), name="poolLow").apply(out3)
out4 = inception((None, None), 480, 192, 96, 208, 16, 48, 64, out31, 30)
out5 = inception((None, None), 512, 160, 112, 224, 24, 64, 64, out4, 40)
out6 = inception((None, None), 512, 128, 128, 256, 24, 64, 64, out5, 50)
out7 = inception((None, None), 512, 112, 144, 288, 32, 64, 64, out6, 60)
out8 = inception((None, None), 528, 256, 160, 320, 32, 128, 128, out7, 70)
out81 = MaxPooling((20, 20), name="poolLow1").apply(out8)
out9 = inception((None, None), 832, 256, 160, 320, 32, 128, 128, out81, 80)
out10 = inception((None, None), 832, 384, 192, 384, 48, 128, 128, out9, 90)
out91 = AveragePooling((5, 5), name="poolLow2").apply(out10)

# Load the model
params = load_parameter_values("catsVsDogs_google_new_Ortho.pkl")
model1 = Model(out8)
model1.set_parameter_values(params)

cost = model1.outputs[0].sum() ** 2
# cost = (T.dot(model1.outputs[0].flatten().T,
#               model1.outputs[0].flatten()) ** 2).sum()
f_prop = theano.function(model1.inputs, model1.outputs[0])
grad = T.grad(cost, model1.inputs)
f_grad = theano.function(model1.inputs, grad)


###### DEEP DREAM CODE #############
def preprocess(img):
    return np.float32(np.rollaxis(img, 2)[::-1]) - img.mean()


def deprocess(img):
    return np.dstack((img + img.mean())[::-1])
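# The snippet above stops before the actual optimization loop. A typical
# deep-dream style gradient-ascent step (a sketch only, assuming the f_grad,
# preprocess and deprocess helpers defined above and a hypothetical
# height x width x 3 image array `img`) could look like:
def dream_step(img, step_size=1.5):
    # The network expects a (1, channels, height, width) batch.
    x = preprocess(img)[None, :]
    # f_grad returns one gradient per model input; take the first.
    g = f_grad(x)[0]
    # Normalize so the step size is roughly independent of gradient scale.
    x = x + step_size / (np.abs(g).mean() + 1e-8) * g
    return deprocess(x[0])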
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[parameter].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def run_visualizations(cost, updates, train_stream, valid_stream, args,
                       hidden_states=None, gate_values=None):
    # Load the parameters from a dumped model
    assert args.load_path is not None
    param_values = load_parameter_values(args.load_path)
    if args.hide_all_except is not None:
        i = args.hide_all_except
        sdim = args.state_dim
        output_size = get_output_size(args.dataset)
        hidden = np.zeros((args.layers * sdim, output_size),
                          dtype=np.float32)
        output_w = param_values["/output_layer.W"]
        hidden[i * sdim:(i + 1) * sdim, :] = \
            output_w[i * sdim:(i + 1) * sdim, :]
        param_values["/output_layer.W"] = hidden
    model = Model(cost)
    model.set_parameter_values(param_values)

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost, hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        else:
            assert False
    elif args.visualize == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream, args)
    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream, args)
    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream, args)
    elif args.visualize == "presoft":
        visualize_presoft(cost, hidden_states, updates,
                          train_stream, valid_stream, args)
    elif args.visualize == "matrices":
        visualize_matrices(args)
    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)
    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates, args)
    else:
        assert False
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            named_copy(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            named_copy(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        applications=[r.bottom.apply], name="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = named_copy(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(regularized_cg.outputs[0]
                                   ).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],   # model cost
            regularized_cg.outputs[2],   # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    # Model is a weird class; we spent lots of time arguing with Bart about
    # what it should be. However, it can already do nice things, e.g. extract
    # all the parameters from the computation graphs and give them
    # hierarchical names. This helps to notice when, because of some bug,
    # a parameter is not in the computation graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params here,
        # as it builds a new computation graph that does not have the
        # shared variables added by adaptive weight noise.
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape)
                     for key in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'],
                                   train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] +
            burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if 
    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
########### GET THE DATA #####################
stream_train = ServerDataStream(('image_features', 'targets'), False,
                                port=5652, hwm=50)
stream_valid = ServerDataStream(('image_features', 'targets'), False,
                                port=5653, hwm=50)

########### DEFINE THE ALGORITHM #############
track_cost = TrackTheBest("cost", after_epoch=True, after_batch=False)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Momentum(learning_rate=0.0001,
                                               momentum=0.9))
extensions = [Timing(),
              FinishAfter(after_n_epochs=num_epochs),
              TrainingDataMonitoring(
                  [cost, error_rate,
                   aggregation.mean(algorithm.total_gradient_norm)],
                  prefix="train",
                  after_epoch=True),
              DataStreamMonitoring([cost, error_rate, error_rate2],
                                   stream_valid,
                                   prefix="valid",
                                   after_epoch=True),
              Checkpoint("google_Ortho2_pretrain2_l0001.pkl",
                         after_epoch=True),
              ProgressBar(),
              Printing()]

# Adding a live plot with the bokeh server
extensions.append(Plot(
    'CatsVsDogs160_GoogleNet_Reload2_l0001',
    channels=[['train_error_rate', 'valid_error_rate'],
              ['valid_cost', 'valid_error_rate2'],
              ['train_total_gradient_norm']],
    after_batch=True))

params = load_parameter_values('GoogleParameters.pkl')
model = Model(cost)
model.set_parameter_values(params)
main_loop = MainLoop(algorithm, data_stream=stream_train,
                     model=model, extensions=extensions)
main_loop.run()
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_parameters().items()], width=120)) train_stream = train_dataset.get_example_stream() train_stream = Mapping(train_stream, _truncate) train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(batch_size)) train_stream = Padding(train_stream) train_stream = Mapping(train_stream, _transpose) parameters = model.get_parameter_dict() maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values()) algorithm = GradientDescent( cost=batch_cost, parameters=parameters.values(), step_rule=CompositeRule([StepClipping(1000.), AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects) ])) ft = features[:6, 0] ft.name = 'feature_example' observables = [cost, ppl, char_cost, length, bits_per_char] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) track_the_best_bpc = TrackTheBest('valid_bits_per_char') root_path, extension = os.path.splitext(save_path) this_step_monitoring = TrainingDataMonitoring( observables + [ft], prefix="this_step", after_batch=True) average_monitoring = TrainingDataMonitoring( observables + [algorithm.total_step_norm, algorithm.total_gradient_norm], prefix="average", every_n_batches=10) valid_monitoring = DataStreamMonitoring( observables, prefix="valid", every_n_batches=1500, before_training=False, data_stream=valid_stream) main_loop = MainLoop( algorithm=algorithm, data_stream=train_stream, model=model, extensions=[ this_step_monitoring, average_monitoring, valid_monitoring, track_the_best_bpc, Checkpoint(save_path, ), Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_bpc.notification_name), (root_path + "_best" + extension,)), Timing(after_batch=True), Printing(every_n_batches=10), Plot(root_path, [[average_monitoring.record_name(cost), valid_monitoring.record_name(cost)], [average_monitoring.record_name(algorithm.total_step_norm)], [average_monitoring.record_name(algorithm.total_gradient_norm)], [average_monitoring.record_name(ppl), valid_monitoring.record_name(ppl)], [average_monitoring.record_name(char_cost), valid_monitoring.record_name(char_cost)], [average_monitoring.record_name(bits_per_char), valid_monitoring.record_name(bits_per_char)]], every_n_batches=10) ]) main_loop.run() elif mode == 'evaluate': with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f: raw_words = [line.split()[1:-1] for line in f.readlines()] words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] for w in raw_words] max_word_length = max([len(w) for w in words]) initial_states = tensor.matrix('init_states') cost_matrix_step = generator.cost_matrix(features, mask=features_mask, states=initial_states) cg = ComputationGraph(cost_matrix_step) states = cg.auxiliary_variables[-2] compute_cost = theano.function([features, features_mask, initial_states], [cost_matrix_step.sum(axis=0), states]) cost_matrix = generator.cost_matrix(features, mask=features_mask) initial_cg = ComputationGraph(cost_matrix) initial_states = 
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i + 1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX),
                        state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))

        print("Average cost", total_word_cost / num_words)
        print("PPL", numpy.exp(total_word_cost / num_words))
        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
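# For reference, the word-level perplexity printed above is the exponential
# of the average per-word negative log-likelihood; a tiny standalone sketch
# with made-up per-word costs (in nats):
import numpy as np

word_costs = np.array([3.2, 5.1, 2.7, 4.0])
avg_cost = word_costs.mean()
print("Average cost", avg_cost)
print("PPL", np.exp(avg_cost))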
TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True), Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True), ProgressBar(), Printing() ] #Adding a live plot with the bokeh server extensions.append( Plot('CatsVsDogs160_GoogleNet_Reload2_l0001', channels=[['train_error_rate', 'valid_error_rate'], ['valid_cost', 'valid_error_rate2'], ['train_total_gradient_norm']], after_batch=True)) params = load_parameter_values('GoogleParameters.pkl') model = Model(cost) model.set_parameter_values(params) main_loop = MainLoop(algorithm, data_stream=stream_train, model=model, extensions=extensions) main_loop.run()
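The warm start above assumes the shapes stored in 'GoogleParameters.pkl' match the freshly built cost graph exactly; when they do not, set_parameter_values fails with a fairly opaque error. A small sketch (the helper is hypothetical, not part of the original script) that reports mismatches before loading:

def check_loaded_shapes(model, param_values):
    # model.get_parameter_dict() maps hierarchical names to shared variables,
    # while load_parameter_values() returns a plain {name: ndarray} dict.
    current = model.get_parameter_dict()
    for name, value in param_values.items():
        if name not in current:
            print("unexpected parameter in dump:", name)
        elif current[name].get_value().shape != value.shape:
            print("shape mismatch for", name,
                  current[name].get_value().shape, "vs", value.shape)

# check_loaded_shapes(model, params)  # before model.set_parameter_values(params)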
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all of them;\n" "there is issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(named_copy(gain_matrix.min(), 'min_gain')) primary_observables.append(named_copy(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter(applications=[r.bottom.apply], name="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) max_recording_length = named_copy(r.recordings.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = named_copy(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = named_copy(attended.shape[0], "max_attended_length") max_num_phonemes = named_copy(labels.shape[0], "max_num_phonemes") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_attended = named_copy(abs(attended).mean(), "mean_attended") mean_bottom_output = named_copy( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = named_copy(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = named_copy(entropy(weights, labels_mask), "weights_entropy") mask_density = named_copy(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = named_copy(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight penalty ' 'or weight decay is probably a bad idea') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=SpeechModel( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = named_copy( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = named_copy( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = named_copy( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance # Model is a weird class; we spent a lot of time arguing with Bart # about what it should be. However, it can already do nice things, e.g. # one can extract all the parameters from the computation graph # and give them hierarchical names. This helps to notice when, # because of some bug, a parameter is not in the computation # graph. model = SpeechModel(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # the shared variables added by adaptive weight noise param_values = load_parameter_values(params) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if 
train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
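initialize_all only builds the pieces; the caller is expected to wire them into a MainLoop. A minimal sketch of such a caller, assuming the training stream is obtained with data.get_stream("train") (the stream name and the train wrapper itself are assumptions; the actual entry point may differ):

def train(config, save_path):
    model, algorithm, data, extensions = initialize_all(
        config, save_path, bokeh_name="", params=None, bokeh_server=None,
        bokeh=False, test_tag=None, use_load_ext=False,
        load_log=False, fast_start=False)
    # The extensions already contain monitoring, checkpointing and stopping
    # conditions, so the loop only needs the algorithm and the data stream.
    main_loop = MainLoop(
        model=model, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()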
def main(mode, save_path, num_batches, data_path=None): reverser = WordReverser(100, len(char2code), name="reverser") if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) # Initialization settings reverser.weights_init = IsotropicGaussian(0.1) reverser.biases_init = Constant(0.0) reverser.push_initialization_config() reverser.encoder.weights_init = Orthogonal() reverser.generator.transition.weights_init = Orthogonal() # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = reverser.cost( chars, chars_mask, targets, targets_mask).sum() batch_size = chars.shape[1].copy(name="batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in parameters.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Define the training algorithm. cg = ComputationGraph(cost) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)])) # Fetch variables useful for debugging generator = reverser.generator (energies,) = VariableFilter( applications=[generator.readout.readout], name_regex="output")(cg.variables) (activations,) = VariableFilter( applications=[generator.transition.apply], name=generator.transition.apply.states[0])(cg.variables) max_length = chars.shape[0].copy(name="max_length") cost_per_character = aggregation.mean( batch_cost, batch_size * max_length).copy( name="character_log_likelihood") min_energy = energies.min().copy(name="min_energy") max_energy = energies.max().copy(name="max_energy") mean_activation = abs(activations).mean().copy( name="mean_activation") observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm] for name, parameter in parameters.items(): observables.append(parameter.norm(2).copy(name + "_norm")) observables.append(algorithm.gradients[parameter].norm(2).copy( name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring( observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches) # This shows a way to handle NaN emerging during # training: simply finish it. .add_condition(["after_batch"], _is_nan), # Saving the model and the log separately is convenient, # because loading the whole pickle takes quite some time. 
Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1)]) main_loop.run() elif mode == "sample" or mode == "beam_search": chars = tensor.lmatrix("input") generated = reverser.generate(chars) model = Model(generated) logger.info("Loading the model...") model.set_parameter_values(load_parameter_values(save_path)) def generate(input_): """Generate output sequences for an input sequence. Encapsulates most of the difference between sampling and beam search. Returns ------- outputs : list of lists Trimmed output sequences. costs : list The negative log-likelihood of generating the respective sequences. """ if mode == "beam_search": samples, = VariableFilter( applications=[reverser.generator.generate], name="outputs")( ComputationGraph(generated[1])) # NOTE: this will recompile beam search functions # every time the user presses Enter. Do not create # a new `BeamSearch` object every time if # speed is important for you. beam_search = BeamSearch(samples) outputs, costs = beam_search.search( {chars: input_}, char2code['</S>'], 3 * input_.shape[0]) else: _1, outputs, _2, _3, costs = ( model.get_theano_function()(input_)) outputs = list(outputs.T) costs = list(costs.T) for i in range(len(outputs)): outputs[i] = list(outputs[i]) try: true_length = outputs[i].index(char2code['</S>']) + 1 except ValueError: true_length = len(outputs[i]) outputs[i] = outputs[i][:true_length] costs[i] = costs[i][:true_length].sum() return outputs, costs while True: try: line = input("Enter a sentence\n") message = ("Enter the number of samples\n" if mode == "sample" else "Enter the beam size\n") batch_size = int(input(message)) except EOFError: break except Exception: traceback.print_exc() continue encoded_input = [char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip()] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input,))[0] print("Target: ", target) samples, costs = generate( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for sample, cost in equizip(samples, costs): message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
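As the NOTE in generate points out, a new BeamSearch is compiled on every call. If interactive latency matters, the search object can be built once from the same sampling graph and reused; a sketch of that refactoring for the beam-search mode (same VariableFilter and BeamSearch calls as above, only hoisted out of the input loop):

samples, = VariableFilter(
    applications=[reverser.generator.generate], name="outputs")(
        ComputationGraph(generated[1]))
beam_search = BeamSearch(samples)  # compiled once, outside the input loop

def generate_with_cached_search(input_):
    # Reuses the already-compiled beam search functions on every call.
    return beam_search.search(
        {chars: input_}, char2code['</S>'], 3 * input_.shape[0])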
def main(argv): name = argv[1] files = map(lambda p: join(folder, p), listdir(folder)) file = next(filter(lambda n: name in n, files)) print(file) p = load_parameter_values(file) net = net_dvc((128, 128)) x = tensor.tensor4('image_features') y_hat = net.apply(x) g = Model(y_hat) for k, v in p.items(): p[k] = v.astype('float32') g.set_parameter_values(p) a, t, v = get_dvc((128, 128), trainning=False, shortcut=False) run = function([x], y_hat) def run_test(data): res = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) return res def max_index(l): if l[0] > l[1]: return 0 else: return 1 def write_kaggle(f, l): f.write("id,label\n") for i, e in enumerate(l, start=1): f.write(str(i) + "," + str(e) + "\n") def kaggle(file, data): write_kaggle(file, map(max_index, run_test(data))) def accuracy(data): res = [] true = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) true.extend(i[1]) res = map(max_index, res) total = 0 equal = 0 for r, t in zip(res, true): total += 1 equal += 1 if r == t else 0 return equal / total print("Training accuracy: ", accuracy(a)) print("Test accuracy: ", accuracy(v)) kaggle_file = join(result_folder, name + ".kaggle") print(kaggle_file) with open(kaggle_file, 'w') as f: kaggle(f, t)
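max_index and the hand-rolled counting loop are specific to the two-class cats-vs-dogs setup. An equivalent formulation with numpy.argmax (a sketch; it assumes run returns an (examples, classes) array and that the second source of each batch holds integer labels) keeps the division explicitly floating point and generalizes to more classes:

import numpy

def accuracy_np(data):
    predictions, targets = [], []
    for batch in data.get_epoch_iterator():
        predictions.extend(numpy.argmax(run(batch[0]), axis=1))
        targets.extend(numpy.asarray(batch[1]).ravel())
    return float(numpy.mean(numpy.asarray(predictions) == numpy.asarray(targets)))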