def main(config, tr_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']
    src_vocab = pickle.load(open(config['src_vocab'], 'rb'))

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'],
                                   config['src_dgru_nhids'], config['enc_nhids'],
                                   config['src_dgru_depth'], config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=None,
                         data_stream=None, extensions=extensions)

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T,
                                             source_sample_matrix,
                                             source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
def make_functions(input_size, output_size, mem_size, mem_width,
                   hidden_sizes=[100]):
    start_time = time.time()
    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size,
                          mem_size, mem_width, hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))

    output_length = (input_seqs.shape[1] - 2) // 2
    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])
    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()
    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
    grads = updates.clip_deltas(grads, np.float32(clip_length))
    print "Done. (%0.3f s)" % (time.time() - start_time)

    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()
    update_pairs = updates.rmsprop(params, grads, learning_rate=1e-4, P=P_learn)
    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )
    test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss)
    print "Done. (%0.3f s)" % (time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test
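A minimal usage sketch for the compiled functions above. The data layout is an assumption inferred from `output_length = (input_seqs.shape[1] - 2) // 2` and the `:-2` slicing: a copy-style task with 8 data bits plus 2 control channels, a delimiter step, and a recall phase of equal length. All sizes here are illustrative.

import numpy as np

# Hypothetical sizes: 8-bit patterns plus 2 control channels.
P, P_learn, train, test = make_functions(input_size=10, output_size=10,
                                         mem_size=128, mem_width=20)

seq_len, batch = 5, 16
pattern = np.random.randint(0, 2, (batch, seq_len, 8)).astype(np.int8)
total = 2 * seq_len + 2                # pattern + delimiter + recall phase
inp = np.zeros((batch, total, 10), dtype=np.int8)
out = np.zeros((batch, total, 10), dtype=np.int8)
inp[:, :seq_len, :8] = pattern         # present the pattern
inp[:, seq_len, 8] = 1                 # delimiter on a control channel
out[:, -seq_len:, :8] = pattern        # expect the pattern back during recall
print train(inp, out)                  # mean binary cross-entropy
print test(inp, out)                   # same loss expressed in bits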
def setup_train(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_pos = T.imatrix()
    # dimensions: (batch, time, output_data)
    encoded_melody = T.btensor3()
    # dimensions: (batch, time)
    correct_notes = T.imatrix()
    n_batch, n_time = relative_pos.shape

    def _build(det_dropout):
        activations = self.lstmstack.do_preprocess_scan(
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            relative_position=relative_pos,
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            last_output=T.concatenate([T.tile(self.encoding.initial_encoded_form(),
                                              (n_batch, 1, 1)),
                                       encoded_melody[:, :-1, :]], 1),
            deterministic_dropout=det_dropout)
        out_probs = self.encoding.decode_to_probs(activations, relative_pos,
                                                  self.bounds.lowbound,
                                                  self.bounds.highbound)
        return Encoding.compute_loss(out_probs, correct_notes, True)

    train_loss, train_info = _build(False)
    updates = Adam(train_loss, self.params, lr=self.learning_rate_var)
    eval_loss, eval_info = _build(True)
    self.loss_info_keys = list(train_info.keys())

    self.update_fun = theano.function(
        inputs=[chord_types, chord_roots, relative_pos, encoded_melody, correct_notes],
        outputs=[train_loss] + list(train_info.values()),
        updates=updates,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
    self.eval_fun = theano.function(
        inputs=[chord_types, chord_roots, relative_pos, encoded_melody, correct_notes],
        outputs=[eval_loss] + list(eval_info.values()),
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
def __init__(self, num_chars, char_dim, max_word_len, embed_dim):
    self.num_chars = num_chars
    self.char_dim = char_dim
    self.max_word_len = max_word_len
    self.embed_dim = embed_dim

    chars1, chars2 = T.itensor3(), T.itensor3()
    mask1, mask2 = T.btensor3(), T.btensor3()
    self.inps = [chars1, chars2, mask1, mask2]

    l_e1, l_e2 = self.build_network()
    self.fn = theano.function(self.inps,
                              [L.get_output(l_e1), L.get_output(l_e2)])
def setup_generate(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    n_batch, n_time = chord_roots.shape

    specs = [lstmstack.prepare_sample_scan(
                 start_pos=T.alloc(np.array(encoding.STARTING_POSITION, np.int32),
                                   (n_batch)),
                 start_out=T.tile(encoding.initial_encoded_form(), (n_batch, 1)),
                 timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                 cur_chord_type=chord_types,
                 cur_chord_root=chord_roots,
                 deterministic_dropout=True)
             for lstmstack, encoding in zip(self.lstmstacks, self.encodings)]

    updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(
        specs, self.lstmstacks, self.encodings, self.srng,
        n_batch, n_time, self.bounds, self.normalize_artic_only)

    self.generate_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
    self.generate_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=[all_chosen, all_probs] + indiv_probs,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
def ndim_btensor(ndim, name=None):
    if ndim == 2:
        return T.bmatrix(name)
    elif ndim == 3:
        return T.btensor3(name)
    elif ndim == 4:
        return T.btensor4(name)
    # fallback for unsupported ndim values (note: int32, not int8)
    return T.imatrix(name)
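A quick usage sketch (the variable name is illustrative):

# Build a 3-D int8 (byte) symbolic tensor, e.g. for a (batch, time, feature) mask.
mask = ndim_btensor(3, name='mask')
print(mask.type)  # -> TensorType(int8, 3D)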
def setup_generate(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    n_batch, n_time = chord_roots.shape

    spec = self.lstmstack.prepare_sample_scan(
        start_pos=T.alloc(np.array(self.encoding.STARTING_POSITION, np.int32),
                          (n_batch)),
        start_out=T.tile(self.encoding.initial_encoded_form(), (n_batch, 1)),
        timestep=T.tile(T.arange(n_time), (n_batch, 1)),
        cur_chord_type=chord_types,
        cur_chord_root=chord_roots,
        deterministic_dropout=True)

    def _scan_fn(*inputs):
        # inputs is [ spec_sequences..., last_absolute_position, spec_taps..., spec_non_sequences... ]
        inputs = list(inputs)
        last_absolute_chosen = inputs.pop(len(spec.sequences))
        scan_rout = self.lstmstack.sample_scan_routine(spec, *inputs)
        last_rel_pos, last_out, cur_kwargs = scan_rout.send(None)
        new_pos = self.encoding.get_new_relative_position(
            last_absolute_chosen, last_rel_pos, last_out,
            self.bounds.lowbound, self.bounds.highbound, **cur_kwargs)
        addtl_kwargs = {"last_output": last_out}
        out_activations = scan_rout.send((new_pos, addtl_kwargs))
        out_probs = self.encoding.decode_to_probs(out_activations, new_pos,
                                                  self.bounds.lowbound,
                                                  self.bounds.highbound)
        sampled_note = Encoding.sample_absolute_probs(self.srng, out_probs)
        encoded_output = self.encoding.note_to_encoding(sampled_note, new_pos,
                                                        self.bounds.lowbound,
                                                        self.bounds.highbound)
        scan_outputs = scan_rout.send(encoded_output)
        scan_rout.close()
        return [sampled_note, out_probs] + scan_outputs

    outputs_info = [{"initial": T.zeros((n_batch,), 'int32'), "taps": [-1]},
                    None] + spec.outputs_info
    result, updates = theano.scan(fn=_scan_fn,
                                  sequences=spec.sequences,
                                  non_sequences=spec.non_sequences,
                                  outputs_info=outputs_info)
    all_chosen = result[0].dimshuffle((1, 0))
    all_probs = result[1].dimshuffle((1, 0, 2))

    self.generate_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
    self.generate_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=[all_chosen, all_probs],
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
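A hedged call sketch for the compiled sampler. Array shapes follow the dimension comments above; `model` is an illustrative instance and the chord data is toy input, not from the project:

import numpy as np

n_batch, n_time = 2, 16
chord_roots = np.zeros((n_batch, n_time), dtype=np.int32)   # all C roots
chord_types = np.zeros((n_batch, n_time, 12), dtype=np.int8)
chord_types[:, :, [0, 4, 7]] = 1                            # major triad per step

# argument order matches inputs=[chord_roots, chord_types]
chosen = model.generate_fun(chord_roots, chord_types)       # (batch, time) sampled notes
chosen, probs = model.generate_visualize_fun(chord_roots, chord_types)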
def BuildModel(modelSpecs, forTrain=True):
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## mask for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    xem = None
    ##if any( k in modelSpecs['seq2matrixMode'] for k in ('SeqOnly', 'Seq+SS') ):
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        distancePredictor = ResNet4DistMatrix(rng,
                                              seqInput=x, matrixInput=y,
                                              mask_seq=xmask, mask_matrix=ymask,
                                              embedInput=xem, modelSpecs=modelSpecs)
    else:
        distancePredictor = ResNet4DistMatrix(rng,
                                              seqInput=x, matrixInput=y,
                                              mask_seq=xmask, mask_matrix=ymask,
                                              modelSpecs=modelSpecs)

    ## labelList is a list of label tensors, each having shape
    ## (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response])
    labelList = []
    if forTrain:
        ## when this model is used for training, we need to define the label variables
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = config.responseValueDims[labelType]
            if labelType.startswith('Discrete'):
                if rValDims > 1:
                    ## if one response is a vector, then we use a 4-d tensor
                    ## wtensor is for 16bit integer
                    labelList.append(T.wtensor4('Tlabel4' + response))
                else:
                    labelList.append(T.wtensor3('Tlabel4' + response))
            else:
                if rValDims > 1:
                    labelList.append(T.tensor4('Tlabel4' + response))
                else:
                    labelList.append(T.tensor3('Tlabel4' + response))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if len(labelList) > 0 and modelSpecs['UseSampleWeight']:
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
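A hedged sketch of consuming the returned symbols for prediction, mirroring the commented-out usage in TestResNet4DistMatrix further down this page (`output_2d` appears there; `modelSpecs` keys are those read above):

predictor, x, y, xmask, ymask, xem, labels, weights = BuildModel(modelSpecs, forTrain=False)
# for prediction, labelList and weightList come back empty
inputs = [x, y, xmask, ymask] + ([xem] if xem is not None else [])
predict = theano.function(inputs, predictor.output_2d, on_unused_input='ignore')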
def setup_encode(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    n_batch, n_time = chord_roots.shape

    all_activations = []
    for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
            self.encodings, self.enc_lstmstacks, encoded_melodies, relative_posns):
        activations = enc_lstmstack.do_preprocess_scan(
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            relative_position=relative_pos,
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            cur_input=encoded_melody,
            deterministic_dropout=True)
        all_activations.append(activations)

    reduced_activations = functools.reduce((lambda x, y: x + y), all_activations)
    strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

    self.encode_fun = theano.function(
        inputs=[chord_types, chord_roots] + relative_posns + encoded_melodies,
        outputs=[strengths, vects],
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
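A hedged call sketch for the compiled encoder. There is one relative-position matrix and one encoded melody per encoding; `model` is an illustrative instance and `enc_width` is a stand-in for each encoding's output width (not a real attribute of the project):

import numpy as np

n_batch, n_time = 2, 16
n_enc = len(model.encodings)
enc_width = 10  # stand-in for each encoding's output width
chord_types = np.zeros((n_batch, n_time, 12), dtype=np.int8)
chord_roots = np.zeros((n_batch, n_time), dtype=np.int32)
rel_posns = [np.zeros((n_batch, n_time), dtype=np.int32) for _ in range(n_enc)]
melodies = [np.zeros((n_batch, n_time, enc_width), dtype=np.int8) for _ in range(n_enc)]
# argument order matches the compiled inputs list
strengths, vects = model.encode_fun(chord_types, chord_roots, *(rel_posns + melodies))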
def main(config, tr_stream, dev_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'],
                                   config['src_dgru_nhids'], config['enc_nhids'],
                                   config['src_dgru_depth'], config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Set extensions
    logger.info("Initializing extensions")
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config,
                              after_batch=True, before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']]))
        # generated[transition_depth] is next_outputs

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        transition_depth=config['transition_depth'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(source_char_seq, source_sample_matrix,
                              source_char_aux, source_word_mask,
                              samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=algorithm,
                         data_stream=tr_stream, extensions=extensions)

    # Train!
    main_loop.run()
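For orientation, a minimal sketch of a config dict covering the keys this function reads. The key names come from the code above, but every value is an illustrative assumption (not the project's defaults), as are the `tr_stream`/`dev_stream` objects:

config = {
    # model sizes
    'src_vocab_size': 120, 'trg_vocab_size': 120,
    'enc_embed': 64, 'dec_embed': 64,
    'src_dgru_nhids': 512, 'trg_dgru_nhids': 512, 'trg_igru_nhids': 512,
    'enc_nhids': 512, 'dec_nhids': 512,
    'src_dgru_depth': 1, 'bidir_encoder_depth': 2,
    'transition_depth': 1, 'trg_igru_depth': 1, 'trg_dgru_depth': 1,
    # initialization / optimization
    'weight_scale': 0.01, 'step_clipping': 1.0, 'step_rule': 'AdaDelta',
    # bookkeeping
    'saveto': 'model', 'reload': False,
    'print_freq': 100, 'finish_after': 1000000, 'save_freq': 5000,
    'hook_samples': 2, 'sampling_freq': 5000,
    'bleu_script': None, 'normalized_bleu': True, 'bleu_val_freq': 5000,
}
main(config, tr_stream, dev_stream)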
def _init_model(self, in_size, out_size, slot_sizes, db,
                n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005,
                batch_size=32, ment=0.1,
                inputtype='full', sl='e2e', rl='e2e'):
    self.in_size = in_size
    self.out_size = out_size
    self.slot_sizes = slot_sizes
    self.batch_size = batch_size
    self.learning_rate = learning_rate_rl
    self.n_hid = n_hid
    self.r_hid = self.n_hid
    self.sl = sl
    self.rl = rl

    table = db.table
    counts = db.counts
    m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
    prior = [db.priors[s] for s in dialog_config.inform_slots]
    unknown = [db.unks[s] for s in dialog_config.inform_slots]
    ids = [db.ids[s] for s in dialog_config.inform_slots]

    input_var, turn_mask, act_mask, reward_var = (
        T.ftensor3('in'), T.bmatrix('tm'), T.btensor3('am'), T.fvector('r'))
    T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(counts)
    db_index_var = T.imatrix('db')
    db_index_switch = T.bvector('s')

    l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
    flat_mask = T.reshape(turn_mask, (turn_mask.shape[0] * turn_mask.shape[1], 1))

    def _smooth(p):
        p_n = p + EPS
        return p_n / (p_n.sum(axis=1)[:, np.newaxis])

    def _add_unk(p, m, N):
        # p: B x V, m- num missing, N- total, p0: 1 x V
        t_unk = T.as_tensor_variable(float(m) / N)
        ps = p * (1. - t_unk)
        return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1)

    def kl_divergence(p, q):
        p_n = _smooth(p)
        return -T.sum(q * T.log(p_n), axis=1)

    # belief tracking
    l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var)
    p_vars = []
    pu_vars = []
    phi_vars = []
    p_targets = []
    phi_targets = []
    hid_in_vars = []
    hid_out_vars = []
    bt_loss = T.as_tensor_variable(0.)
    kl_loss = []
    x_loss = []
    self.trackers = []
    for i, s in enumerate(dialog_config.inform_slots):
        hid_in = T.fmatrix('h')
        l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in,
                           mask_input=l_mask_in,
                           grad_clipping=10.)  # B x H x D
        l_b_in = L.ReshapeLayer(
            l_rnn, (input_var.shape[0] * input_var.shape[1], self.r_hid))  # BH x D
        hid_out = L.get_output(l_rnn)[:, -1, :]

        p_targ = T.ftensor3('p_target_' + s)
        p_t = T.reshape(p_targ,
                        (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i]))
        phi_targ = T.fmatrix('phi_target' + s)
        phi_t = T.reshape(phi_targ, (phi_targ.shape[0] * phi_targ.shape[1], 1))

        l_b = L.DenseLayer(l_b_in, self.slot_sizes[i],
                           nonlinearity=lasagne.nonlinearities.softmax)
        l_phi = L.DenseLayer(l_b_in, 1,
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
        p = L.get_output(l_b)
        p_u = _add_unk(p, m_unk[i], db.N)

        kl_loss.append(T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) /
                       T.sum(flat_mask))
        x_loss.append(T.sum(flat_mask *
                            lasagne.objectives.binary_crossentropy(phi, phi_t)) /
                      T.sum(flat_mask))
        bt_loss += kl_loss[-1] + x_loss[-1]

        p_vars.append(p)
        pu_vars.append(p_u)
        phi_vars.append(phi)
        p_targets.append(p_targ)
        phi_targets.append(phi_targ)
        hid_in_vars.append(hid_in)
        hid_out_vars.append(hid_out)
        self.trackers.append(l_b)
        self.trackers.append(l_phi)
    self.bt_params = L.get_all_params(self.trackers)

    def check_db(pv, phi, Tb, N):
        O = T.alloc(0., pv[0].shape[0], Tb.shape[0])  # BH x T.shape[0]
        for i, p in enumerate(pv):
            p_dc = T.tile(phi[i], (1, Tb.shape[0]))
            O += T.log(p_dc * (1. / db.table.shape[0]) +
                       (1. - p_dc) * (p[:, Tb[:, i]] / N[np.newaxis, :, i]))
        Op = T.exp(O)  # +EPS # BH x T.shape[0]
        Os = T.sum(Op, axis=1)[:, np.newaxis]  # BH x 1
        return Op / Os

    def entropy(p):
        p = _smooth(p)
        return -T.sum(p * T.log(p), axis=-1)

    def weighted_entropy(p, q, p0, unks, idd):
        w = T.dot(idd, q.transpose())  # Pi x BH
        u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis])  # BH x Pi
        p_tilde = w.transpose() + u
        return entropy(p_tilde)

    p_db = check_db(pu_vars, phi_vars, T_var, N_var)  # BH x T.shape[0]

    if inputtype == 'entropy':
        H_vars = [weighted_entropy(pv, p_db, prior[i], unknown[i], ids[i])
                  for i, pv in enumerate(p_vars)]
        H_db = entropy(p_db)
        phv = [ph[:, 0] for ph in phi_vars]
        t_in = T.stacklists(H_vars + phv + [H_db]).transpose()  # BH x 2M+1
        t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1],
                                     t_in.shape[1]))  # B x H x 2M+1
        l_in_pol = L.InputLayer(
            shape=(None, None, 2 * len(dialog_config.inform_slots) + 1),
            input_var=t_in_resh)
    else:
        in_reshaped = T.reshape(input_var,
                                (input_var.shape[0] * input_var.shape[1],
                                 input_var.shape[2]))
        prev_act = in_reshaped[:, -len(dialog_config.inform_slots):]
        t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act],
                             axis=1)  # BH x D-sum+A
        t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1],
                                     t_in.shape[1]))  # B x H x D-sum
        l_in_pol = L.InputLayer(shape=(None, None, sum(self.slot_sizes) +
                                       3 * len(dialog_config.inform_slots) +
                                       table.shape[0]),
                                input_var=t_in_resh)

    pol_in = T.fmatrix('pol-h')
    l_pol_rnn = L.GRULayer(l_in_pol, n_hid, hid_init=pol_in,
                           mask_input=l_mask_in,
                           grad_clipping=10.)  # B x H x D
    pol_out = L.get_output(l_pol_rnn)[:, -1, :]
    l_den_in = L.ReshapeLayer(
        l_pol_rnn, (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
    l_out = L.DenseLayer(l_den_in, self.out_size,
                         nonlinearity=lasagne.nonlinearities.softmax)  # BH x A
    self.network = l_out
    self.pol_params = L.get_all_params(self.network)
    self.params = self.bt_params + self.pol_params

    # db loss
    p_db_reshaped = T.reshape(p_db, (turn_mask.shape[0], turn_mask.shape[1],
                                     table.shape[0]))
    p_db_final = p_db_reshaped[:, -1, :]  # B x T.shape[0]
    p_db_final = _smooth(p_db_final)
    ix = T.tile(T.arange(p_db_final.shape[0]),
                (db_index_var.shape[1], 1)).transpose()
    sample_probs = p_db_final[ix, db_index_var]  # B x K
    if dialog_config.SUCCESS_MAX_RANK == 1:
        log_db_probs = T.log(sample_probs).sum(axis=1)
    else:
        cum_probs, _ = theano.scan(fn=lambda x, prev: x + prev,
                                   outputs_info=T.zeros_like(sample_probs[:, 0]),
                                   sequences=sample_probs[:, :-1].transpose())
        cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5)  # B x K-1
        log_db_probs = T.log(sample_probs).sum(axis=1) - \
                T.log(1. - cum_probs).sum(axis=1)  # B
    log_db_probs = log_db_probs * db_index_switch

    # rl
    probs = L.get_output(self.network)  # BH x A
    probs = _smooth(probs)
    out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1],
                                  self.out_size))  # B x H x A
    log_probs = T.log(out_probs)
    act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
    ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
    H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
    self.act_loss = -T.mean(ep_probs * reward_var)
    self.db_loss = -T.mean(log_db_probs * reward_var)
    self.reg_loss = -T.mean(ment * H_probs)
    self.loss = self.act_loss + self.db_loss + self.reg_loss

    self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var,
                 db_index_switch, pol_in] + hid_in_vars
    self.obj_fn = theano.function(self.inps, self.loss, on_unused_input='warn')
    self.act_fn = theano.function([input_var, turn_mask, pol_in] + hid_in_vars,
                                  [out_probs, p_db, pol_out] +
                                  pu_vars + phi_vars + hid_out_vars,
                                  on_unused_input='warn')
    self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss],
                                    on_unused_input='warn')
    self._rl_train_fn(self.learning_rate)

    ## sl
    sl_loss = 0. + bt_loss - T.mean(ep_probs)
    if self.sl == 'e2e':
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.params,
                                             learning_rate=learning_rate_sl,
                                             epsilon=1e-4)
        sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
    elif self.sl == 'bel':
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params,
                                             learning_rate=learning_rate_sl,
                                             epsilon=1e-4)
        sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
    else:
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params,
                                             learning_rate=learning_rate_sl,
                                             epsilon=1e-4)
        sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)

    sl_inps = [input_var, turn_mask, act_mask, pol_in] + \
            p_targets + phi_targets + hid_in_vars
    self.sl_train_fn = theano.function(sl_inps, [sl_loss] + kl_loss + x_loss,
                                       updates=sl_updates,
                                       on_unused_input='warn')
    self.sl_obj_fn = theano.function(sl_inps, sl_loss, on_unused_input='warn')
def setup_train(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    # dimensions: (batch, time)
    correct_notes = T.imatrix()
    n_batch, n_time = chord_roots.shape

    def _build(det_dropout):
        all_out_probs = []
        for encoding, lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.lstmstacks, encoded_melodies, relative_posns):
            activations = lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                last_output=T.concatenate([T.tile(encoding.initial_encoded_form(),
                                                  (n_batch, 1, 1)),
                                           encoded_melody[:, :-1, :]], 1),
                deterministic_dropout=det_dropout)
            out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                 self.bounds.lowbound,
                                                 self.bounds.highbound)
            all_out_probs.append(out_probs)
        reduced_out_probs = functools.reduce((lambda x, y: x * y), all_out_probs)
        if self.normalize_artic_only:
            non_artic_probs = reduced_out_probs[:, :, :2]
            artic_probs = reduced_out_probs[:, :, 2:]
            non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True)
            artic_sum = T.sum(artic_probs, 2, keepdims=True)
            norm_artic_probs = artic_probs * (1 - non_artic_sum) / artic_sum
            norm_out_probs = T.concatenate([non_artic_probs, norm_artic_probs], 2)
        else:
            normsum = T.sum(reduced_out_probs, 2, keepdims=True)
            normsum = T.maximum(normsum, constants.EPSILON)
            norm_out_probs = reduced_out_probs / normsum
        return Encoding.compute_loss(norm_out_probs, correct_notes, True)

    train_loss, train_info = _build(False)
    updates = Adam(train_loss, self.get_optimize_params(), lr=self.learning_rate_var)
    eval_loss, eval_info = _build(True)
    self.loss_info_keys = list(train_info.keys())

    self.update_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
        outputs=[train_loss] + list(train_info.values()),
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore',
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
    self.eval_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
        outputs=[eval_loss] + list(eval_info.values()),
        allow_input_downcast=True,
        on_unused_input='ignore',
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
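The `normalize_artic_only` branch rescales only the articulated-note probabilities so the full distribution still sums to 1, while the first two channels keep their raw product-of-experts values. A quick numpy check of that identity (toy numbers, not from the model):

import numpy as np

probs = np.array([0.10, 0.05, 0.20, 0.30, 0.10])  # [first two channels, artic...]
non_artic, artic = probs[:2], probs[2:]
norm_artic = artic * (1 - non_artic.sum()) / artic.sum()
print(np.concatenate([non_artic, norm_artic]).sum())  # -> 1.0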
def setup_train(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    # dimensions: (batch, time)
    correct_notes = T.imatrix()
    n_batch, n_time = chord_roots.shape

    def _build(det_dropout):
        all_activations = []
        for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.enc_lstmstacks, encoded_melodies, relative_posns):
            activations = enc_lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_input=encoded_melody,
                deterministic_dropout=det_dropout)
            all_activations.append(activations)
        reduced_activations = functools.reduce((lambda x, y: x + y), all_activations)
        queue_loss, feat_strengths, feat_vects, queue_info = self.qman.process(
            reduced_activations, extra_info=True)
        features = QueueManager.queue_transform(feat_strengths, feat_vects)

        all_out_probs = []
        for encoding, dec_lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.dec_lstmstacks, encoded_melodies, relative_posns):
            activations = dec_lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_feature=features,
                last_output=T.concatenate([
                    T.tile(encoding.initial_encoded_form(), (n_batch, 1, 1)),
                    encoded_melody[:, :-1, :]
                ], 1),
                deterministic_dropout=det_dropout)
            out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                 self.bounds.lowbound,
                                                 self.bounds.highbound)
            all_out_probs.append(out_probs)
        reduced_out_probs = functools.reduce((lambda x, y: x * y), all_out_probs)
        normsum = T.sum(reduced_out_probs, 2, keepdims=True)
        normsum = T.maximum(normsum, constants.EPSILON)
        norm_out_probs = reduced_out_probs / normsum
        reconstruction_loss, reconstruction_info = Encoding.compute_loss(
            norm_out_probs, correct_notes, extra_info=True)

        queue_surrogate_loss_parts = self.qman.surrogate_loss(
            reconstruction_loss, queue_info)

        updates = []
        full_info = queue_info.copy()
        full_info.update(reconstruction_info)
        full_info["queue_loss"] = queue_loss
        full_info["reconstruction_loss"] = reconstruction_loss

        float_n_batch = T.cast(n_batch, 'float32')
        # NOTE: the original compared strings with "is", which only works by
        # accident of interning; "==" is the correct comparison.
        if self.loss_mode == "add":
            full_loss = queue_loss + reconstruction_loss
        elif self.loss_mode == "priority":
            curviness = np.array(self.loss_mode_params[0], np.float32) * float_n_batch
            # ln( e^x + e^y - 1 )
            # ln( C(e^x + e^y - 1) ) - ln(C)
            # ln( e^c(e^x + e^y - 1) ) - c
            # ln( e^(x+c) + e^(y+c) - e^c ) - c
            # ln( e^(x-c) + e^(y-c) - e^(-c) ) + c
            # Now let c = maximum(x,y), d = minimum(x,y). WLOG replace x=c, y=d:
            # ln( e^(c-c) + e^(d-c) - e^(-c) ) + c
            # ln( 1 + e^(d-c) - e^(-c) ) + c
            x = reconstruction_loss / curviness
            y = queue_loss / curviness
            c = T.maximum(x, y)
            d = T.minimum(x, y)
            full_loss = (T.log(1 + T.exp(d - c) - T.exp(-c)) + c) * curviness
        elif self.loss_mode == "cutoff":
            cutoff_val = np.array(self.loss_mode_params[0], np.float32)
            full_loss = T.switch(reconstruction_loss < cutoff_val * float_n_batch,
                                 reconstruction_loss + queue_loss,
                                 reconstruction_loss)
        elif self.loss_mode == "trigger":
            trigger_val = np.array(self.loss_mode_params[0], np.float32)
            trigger_speed = np.array(1.0 / self.loss_mode_params[1], np.float32)
            trigger_is_on = theano.shared(np.array(0, np.int8))
            trigger_scale = theano.shared(np.array(0.0, np.float32))
            full_loss = reconstruction_loss + trigger_scale * queue_loss
            updates.append((trigger_is_on,
                            T.or_(trigger_is_on,
                                  reconstruction_loss < trigger_val * float_n_batch)))
            updates.append((trigger_scale,
                            T.switch(trigger_is_on,
                                     T.minimum(trigger_scale + trigger_speed,
                                               np.array(1.0, np.float32)),
                                     np.array(0.0, np.float32))))
            full_info["trigger_scale"] = trigger_scale

        if queue_surrogate_loss_parts is not None:
            surrogate_loss, addtl_updates = queue_surrogate_loss_parts
            full_loss = full_loss + surrogate_loss
            updates.extend(addtl_updates)
            full_info["surrogate_loss"] = surrogate_loss

        return full_loss, full_info, updates

    train_loss, train_info, train_updates = _build(False)
    if self.train_decoder_only:
        params = list(itertools.chain(*(lstmstack.params
                                        for lstmstack in self.dec_lstmstacks)))
    else:
        params = self.params
    adam_updates = Adam(train_loss, params, lr=self.learning_rate_var)
    eval_loss, eval_info, _ = _build(True)
    self.loss_info_keys = list(train_info.keys())

    self.update_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
        outputs=[train_loss] + list(train_info.values()),
        updates=train_updates + adam_updates,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
    self.eval_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
        outputs=[eval_loss] + list(eval_info.values()),
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
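The derivation in the "priority" branch is the usual log-sum-exp stabilization of ln(e^x + e^y - 1): pulling out c = max(x, y) keeps every exponential bounded by 1. A toy numpy check (values are arbitrary):

import numpy as np

x, y = 50.0, 48.0                    # naive np.exp(50) would overflow float32
c, d = max(x, y), min(x, y)
stable = np.log(1 + np.exp(d - c) - np.exp(-c)) + c
print(stable)                        # ~= ln(e^50 + e^48 - 1), no overflow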
def TestResNet4DistMatrix():
    x = T.tensor3('x')
    y = T.tensor4('y')
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')
    selection = T.wtensor3('selection')

    import cPickle
    fh = open('seqDataset4HF.pkl')
    data = cPickle.load(fh)
    fh.close()

    distancePredictor = ResNet4DistMatrix(rng=np.random.RandomState(),
                                          seqInput=x, matrixInput=y,
                                          n_in_seq=data[0][0].shape[2],
                                          n_in_matrix=data[1][0].shape[3],
                                          n_hiddens_seq=[3, 5],
                                          n_hiddens_matrix=[2],
                                          hwsz_seq=4, hwsz_matrix=4,
                                          mask_seq=xmask, mask_matrix=ymask)
    """
    f = theano.function([x, y, xmask, ymask], distancePredictor.output_1d)
    g = theano.function([x, y, xmask, ymask], distancePredictor.output_2d)
    """

    dataLen = 300
    batchSize = 60
    a = np.random.uniform(0, 1, (batchSize, dataLen, 20)).astype(np.float32)
    b = np.random.uniform(0, 1, (batchSize, dataLen, dataLen, 3)).astype(np.float32)
    amask = np.zeros((batchSize, 0)).astype(np.int8)
    bmask = np.zeros((batchSize, 0, dataLen)).astype(np.int8)
    sel = np.ones((batchSize, dataLen, dataLen)).astype(np.int8)
    #print a
    #print b
    c = np.random.uniform(0, 3, (batchSize, dataLen, dataLen)).round().astype(np.int8)
    np.putmask(c, c >= 2, 2)
    """
    c[0, 1, 13]=1
    c[0, 2, 15]=1
    c[0, 4, 16]=1
    c[0, 1, 27]=1
    c[0, 2, 28]=1
    c[0, 4, 29]=1
    c[1, 0, 13]=2
    c[1, 1, 15]=2
    c[1, 3, 16]=2
    c[2, 0, 23]=2
    c[2, 1, 25]=2
    c[2, 3, 26]=2
    """
    #sel = c
    #out1d = f(a, b, amask, bmask)
    #out2d = g(a, b, amask, bmask)
    #print out1d
    #print out2d

    z = T.btensor3('z')
    loss = distancePredictor.loss(z, selection)
    errs = distancePredictor.ErrorsByRange(z)
    accs = distancePredictor.TopAccuracyByRange(z)
    confM = distancePredictor.confusionMatrix(z)
    h = theano.function([x, y, xmask, ymask, selection, z], confM,
                        on_unused_input='ignore')
    #l, e, accu = h(a, b, amask, bmask, sel, c)

    cms = []
    for i in np.arange(5):
        cm = h(data[0][i], data[1][i], data[2][i], data[3][i], data[4][i], data[5][i])
        print(cm)
        cms.append(cm)

    sumofcms = np.sum(cms, axis=0) * 1.
    for i in range(sumofcms.shape[0]):
        sumofcms[i] /= np.sum(sumofcms[i])
    confusions = sumofcms
    print(confusions)
    print(np.sum(confusions[0]))
    print(np.sum(confusions[1]))
    print(np.sum(confusions[2]))
def main(config, test_stream, testing_model):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = test_stream.trg_bos
    target_space_idx = test_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'],
                                   config['src_dgru_nhids'], config['enc_nhids'],
                                   config['src_dgru_depth'], config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)
    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = []
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(testing_model))

    # Set up beam search and sampling computation graphs if necessary
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']]))
        # generated[config['transition_depth']] is next_outputs

        logger.info("Building bleu tester")
        extensions.append(
            BleuTester(source_char_seq, source_sample_matrix, source_char_aux,
                       source_word_mask, samples=samples, config=config,
                       model=search_model, data_stream=test_stream,
                       testing_model=testing_model,
                       normalize=config['normalized_bleu']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=None,
                         data_stream=None, extensions=extensions)

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
def build_network():
    from lasagne.layers import InputLayer, LSTMLayer, ConcatLayer, \
            ReshapeLayer, DenseLayer, get_output, get_all_params
    from lasagne.objectives import categorical_crossentropy

    print("Building network ...")

    # inputs ###############################################
    l_in_x = InputLayer(shape=(BATCH_SIZE, None, vocab_size))
    l_in_y = InputLayer(shape=(BATCH_SIZE, None, vocab_size))

    # encoder ##############################################
    l_enc = LSTMLayer(l_in_x, N_HIDDEN, grad_clipping=GRAD_CLIP,
                      nonlinearity=lasagne.nonlinearities.tanh,
                      only_return_final=True)

    # decoder ##############################################
    l_repeated_enc = Repeat(l_enc, SEQ_LENGTH)
    l_conc = ConcatLayer([l_in_y, l_repeated_enc], axis=2)
    l_dec = LSTMLayer(l_conc, N_HIDDEN, grad_clipping=GRAD_CLIP,
                      nonlinearity=lasagne.nonlinearities.tanh)

    # output ###############################################
    l_dec_long = ReshapeLayer(l_dec, shape=(-1, N_HIDDEN))
    l_dist = DenseLayer(l_dec_long, num_units=vocab_size,
                        nonlinearity=lasagne.nonlinearities.softmax)
    l_out = ReshapeLayer(l_dist, shape=(BATCH_SIZE, -1, vocab_size))
    # print(lasagne.layers.get_output_shape(l_out))

    # compilations #########################################
    target_values = T.btensor3('target_output')
    network_output = get_output(l_out)
    cost = categorical_crossentropy(network_output, target_values).mean()
    all_params = get_all_params(l_out, trainable=True)

    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        inputs=[l_in_x.input_var, l_in_y.input_var, target_values],
        outputs=cost, updates=updates, allow_input_downcast=True)
    # network_output depends on both l_in_x and l_in_y (via the concat layer),
    # so both input variables are required below; the original omitted
    # l_in_y.input_var, which would raise a MissingInputError at compile time.
    compute_cost = theano.function(
        inputs=[l_in_x.input_var, l_in_y.input_var, target_values],
        outputs=cost, allow_input_downcast=True)
    predict = theano.function(
        inputs=[l_in_x.input_var, l_in_y.input_var],
        outputs=network_output, allow_input_downcast=True)

    return train, predict, compute_cost
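A hedged usage sketch, assuming the module-level constants BATCH_SIZE, SEQ_LENGTH, N_HIDDEN, GRAD_CLIP, LEARNING_RATE, and vocab_size are defined, and that x/y are one-hot (batch, time, vocab) arrays; the dummy data is illustrative only:

import numpy as np

train, predict, compute_cost = build_network()

x = np.zeros((BATCH_SIZE, SEQ_LENGTH, vocab_size), dtype=np.float32)
y = np.zeros((BATCH_SIZE, SEQ_LENGTH, vocab_size), dtype=np.int8)
x[:, :, 0] = 1   # dummy one-hot inputs
y[:, :, 0] = 1   # dummy one-hot targets

loss = train(x, y, y)    # teacher forcing: y is both decoder input and target
probs = predict(x, y)    # (batch, time, vocab) softmax outputs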
def model_setup(self, mfile=None, num_units1=128, num_units2=128, lrate=2e-3,
                drate=0.95, eps=1e-8, bptt_maxdepth=50, l1=0, l2=0,
                char_dim=None):
    """Initialize the 2-layer LSTM model, either for learning or for the
    generation of sequences."""
    # the default parameters are identical to Andrej Karpathy's
    # (see https://github.com/karpathy/char-rnn)

    # 2-layer LSTM parameters
    self.p = {'U1': None, 'W1': None, 'b1': None,
              'U2': None, 'W2': None, 'b2': None,
              'V': None, 'c': None}
    # learning parameters
    self.lp = {'lrate': lrate,                  # learning rate
               'drate': drate,                  # decay rate for rmsprop
               'eps': eps,                      # epsilon parameter for rmsprop
               'bptt_maxdepth': bptt_maxdepth,  # backpropagation cutoff
               'l1': l1,                        # L1 regularization parameter
               'l2': l2}                        # L2 regularization parameter

    # initializer helpers (defined at function level so that both the
    # mfile and the from-scratch paths can use them)
    def uniform(rng, shape):
        return np.random.uniform(-rng, rng, shape).astype(theano.config.floatX)

    def randn(rng, shape):
        # Gaussian initialization scaled by rng
        return (rng * np.random.randn(*shape)).astype(theano.config.floatX)

    def bias_hack(num_units):
        b = np.zeros((4, num_units))
        # forget gate hack: helps the network remember information
        b[0] = 1.
        return b.astype(theano.config.floatX)

    def zeros(shape):
        return np.zeros(shape).astype(theano.config.floatX)

    def ones(shape):
        return np.ones(shape).astype(theano.config.floatX)

    if mfile is not None:
        # loading parameters from an npz file
        np_init = self.load_params(mfile)
        num_units1 = np_init['b1'].shape[1]
        num_units2 = np_init['b2'].shape[1]
    else:
        if char_dim is None:
            if self.uchar:
                char_dim = len(self.uchar)
            else:
                raise Exception('prepare_input() should be run before '
                                'model_setup() unless mfile is provided')

        # initialize small random weights
        r_char_dim = np.sqrt(1. / char_dim)
        r_units1 = np.sqrt(1. / num_units1)
        r_units2 = np.sqrt(1. / num_units2)

        # parameters for the gates
        # [0]: forget
        # [1]: input
        # [2]: output
        # [3]: cell state update
        np_init = {}
        # first layer
        np_init['U1'] = uniform(r_char_dim, (4, num_units1, char_dim))
        np_init['W1'] = uniform(r_units1, (4, num_units1, num_units1))
        np_init['b1'] = bias_hack(num_units1)
        # second layer
        np_init['U2'] = uniform(r_units1, (4, num_units2, num_units1))
        np_init['W2'] = uniform(r_units2, (4, num_units2, num_units2))
        np_init['b2'] = bias_hack(num_units2)
        # parameters for the last layer (cell output -> network output)
        np_init['V'] = uniform(r_units2, (char_dim, num_units2))
        np_init['c'] = zeros(char_dim)

    # dynamical learning rate (in case the user wants to modify it
    # during the learning process)
    if theano.config.floatX == 'float32':
        dyn_lrate_init = np.float32(self.lp['lrate'])
    else:
        dyn_lrate_init = np.float64(self.lp['lrate'])
    self.dyn_lrate = theano.shared(dyn_lrate_init, name='dyn_lrate')

    # parameters for rmsprop (running average of squared gradients)
    msq_g = {}
    for param in self.p:
        msq_g[param] = theano.shared(zeros(np_init[param].shape),
                                     name='msq_g' + param)

    for param in self.p:
        self.p[param] = theano.shared(np_init[param], name=param)

    if self.batch_size > 1:
        x = T.imatrix('x')
        y = T.btensor3('y')
    else:
        x = T.ivector('x')
        y = T.bmatrix('y')

    def forward_prop(x, ht1m1, Ct1m1, ht2m1, Ct2m1,
                     U1, W1, b1, U2, W2, b2, V, c):
        # defines each time step of the RNN model
        if self.batch_size > 1:
            # transform into column vectors
            col_b1 = b1.dimshuffle((0, 1, 'x'))
            col_b2 = b2.dimshuffle((0, 1, 'x'))
            col_c = c.dimshuffle((0, 'x'))
        else:
            col_b1 = b1
            col_b2 = b2
            col_c = c
        # layer 1
        gates1 = []
        for i in xrange(3):  # forget, input and output gates
            gates1.append(T.nnet.sigmoid(U1[i][:, x] + W1[i].dot(ht1m1)
                                         + col_b1[i]))
        tentative_Ct1 = T.tanh(U1[3][:, x] + W1[3].dot(ht1m1) + col_b1[3])
        Ct1 = Ct1m1 * gates1[0] + tentative_Ct1 * gates1[1]
        ht1 = gates1[2] * T.tanh(Ct1)
        # layer 2
        gates2 = []
        for i in xrange(3):  # forget, input and output gates
            gates2.append(T.nnet.sigmoid(U2[i].dot(ht1) + W2[i].dot(ht2m1)
                                         + col_b2[i]))
        tentative_Ct2 = T.tanh(U2[3].dot(ht1) + W2[3].dot(ht2m1) + col_b2[3])
        Ct2 = Ct2m1 * gates2[0] + tentative_Ct2 * gates2[1]
        ht2 = gates2[2] * T.tanh(Ct2)
        # final layer
        o = T.nnet.softmax((V.dot(ht2) + col_c).T)
        return [o, ht1, Ct1, ht2, Ct2]

    if self.batch_size > 1:
        ht1_Ct1_size = (num_units1, self.batch_size)
        ht2_Ct2_size = (num_units2, self.batch_size)
    else:
        ht1_Ct1_size = num_units1
        ht2_Ct2_size = num_units2

    [o, ht1, Ct1, ht2, Ct2], updates = theano.scan(
        fn=forward_prop,
        sequences=x,
        outputs_info=[None,
                      T.zeros(ht1_Ct1_size), T.zeros(ht1_Ct1_size),
                      T.zeros(ht2_Ct2_size), T.zeros(ht2_Ct2_size)],
        non_sequences=[self.p['U1'], self.p['W1'], self.p['b1'],
                       self.p['U2'], self.p['W2'], self.p['b2'],
                       self.p['V'], self.p['c']],
        truncate_gradient=self.lp['bptt_maxdepth'],
        strict=True)

    # o is a (seq_len, batch_size, char_dim) tensor, even if batch_size == 1
    prediction = T.argmax(o, axis=2)

    self.theano_predict = theano.function(inputs=[x],
                                          outputs=[o, prediction])

    if mfile is not None:
        # not here for learning; we can stop here
        return

    # compute the cross-entropy loss
    xent = (-y * T.log(o)).sum(axis=2)  # (string_len, batch_size) matrix
    cost = T.mean(xent)

    # regularization using L1 and/or L2 norms
    reg_cost = cost
    # casting to theano.config.floatX avoids an implicit float64 below
    tot_shape = (xent.shape[0] * xent.shape[1]).astype(theano.config.floatX)
    for param in self.p:
        if l1 > 0:  # L1 regularization
            reg_cost += l1 * T.sum(abs(self.p[param])) / tot_shape
        if l2 > 0:  # L2 regularization
            reg_cost += l2 * T.sum(self.p[param] ** 2) / tot_shape

    g = {}
    for param in self.p:
        g[param] = T.grad(reg_cost, self.p[param])

    # for rmsprop
    new_msq_g = {}
    updates = {}
    rmsprop_updates = []
    sgd_updates = []
    ratios = {}
    for param in self.p:
        new_msq_g[param] = (self.lp['drate'] * msq_g[param]
                            + (1. - self.lp['drate']) * g[param] ** 2)
        updates[param] = (self.dyn_lrate * g[param]
                          / (T.sqrt(new_msq_g[param]) + self.lp['eps']))
        # update-to-parameter scale ratio
        ratios[param] = (T.flatten(updates[param]).norm(2)
                         / T.flatten(self.p[param]).norm(2))
        sgd_updates.append((self.p[param],
                            self.p[param] - self.dyn_lrate * g[param]))
        rmsprop_updates.append((self.p[param],
                                self.p[param] - updates[param]))
        rmsprop_updates.append((msq_g[param], new_msq_g[param]))

    # todo: add the possibility to clip gradients to some value
    f_out = [cost, prediction]
    # compute cost and prediction but do not update the weights
    self.theano_check = theano.function(inputs=[x, y], outputs=f_out)

    f_out.extend([ratios['U1'], ratios['W1'], ratios['b1'],
                  ratios['U2'], ratios['W2'], ratios['b2'],
                  ratios['V'], ratios['c']])
    # mini-batch training with rmsprop
    self.theano_train_rmsprop = theano.function(inputs=[x, y],
                                                outputs=f_out,
                                                updates=rmsprop_updates)
    # mini-batch training with stochastic gradient descent
    self.theano_train_sgd = theano.function(inputs=[x, y],
                                            outputs=f_out,
                                            updates=sgd_updates)
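# A minimal NumPy sketch of the single LSTM step computed by forward_prop
# above, for the batch_size == 1 case; shapes follow the np_init layout
# (U: (4, n_units, in_dim), W: (4, n_units, n_units), b: (4, n_units)).
# This illustrates the math only and is not part of the model.

import numpy as np

def sigmoid(z):
    return 1. / (1. + np.exp(-z))

def lstm_step(char_idx, h_prev, C_prev, U, W, b):
    # char_idx: integer index of the current character, so U[i][:, char_idx]
    # picks the same column that U1[i][:, x] selects in the Theano graph
    f = sigmoid(U[0][:, char_idx] + W[0].dot(h_prev) + b[0])  # forget gate
    i = sigmoid(U[1][:, char_idx] + W[1].dot(h_prev) + b[1])  # input gate
    o = sigmoid(U[2][:, char_idx] + W[2].dot(h_prev) + b[2])  # output gate
    C_tilde = np.tanh(U[3][:, char_idx] + W[3].dot(h_prev) + b[3])
    C = C_prev * f + C_tilde * i   # new cell state
    h = o * np.tanh(C)             # new hidden state
    return h, C

# example with 8 hidden units over a 5-character alphabet
n_units, char_dim = 8, 5
U = np.random.uniform(-0.4, 0.4, (4, n_units, char_dim))
W = np.random.uniform(-0.35, 0.35, (4, n_units, n_units))
b = np.zeros((4, n_units))
b[0] = 1.  # same forget-gate hack as above
h, C = lstm_step(2, np.zeros(n_units), np.zeros(n_units), U, W, b)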
def setup_generate(self):
    print('{:25}'.format("Setup Generate"), end='', flush=True)

    self.generate_seed_input = T.btensor3()
    self.steps_to_simulate = T.iscalar()

    def step_time_seed(in_data, *hiddens):
        # deterministic dropout: scale by the keep probability
        # instead of sampling binary masks
        if self.dropout > 0:
            time_masks = [1 - self.dropout
                          for layer in self.time_model.layers]
            time_masks[0] = None
        else:
            time_masks = []
        new_states = self.time_model.forward(in_data,
                                             prev_hiddens=hiddens,
                                             dropout=time_masks)
        return new_states

    # feed the seed, except its last timestep, through the time model
    time_inputs = self.generate_seed_input[0:-1]
    n_time, n_note, n_ipn = time_inputs.shape
    time_outputs_info_seed = [initial_state_with_taps(layer, n_note)
                              for layer in self.time_model.layers]
    time_result, _ = theano.scan(fn=step_time_seed,
                                 sequences=[time_inputs],
                                 outputs_info=time_outputs_info_seed)
    last_layer = get_last_layer(time_result)
    n_hidden = last_layer.shape[2]

    def step_time(*states):
        hiddens = list(states[:-2])
        in_data = states[-2]
        time = states[-1]
        if self.dropout > 0:
            masks = [1 - self.dropout for layer in self.time_model.layers]
            masks[0] = None
        else:
            masks = []
        new_states = self.time_model.forward(in_data,
                                             prev_hiddens=hiddens,
                                             dropout=masks)
        time_final = get_last_layer(new_states)

        start_note_values = theano.tensor.alloc(np.array(0, dtype=np.int8),
                                                self.output_size)
        note_outputs_info = ([initial_state_with_taps(layer)
                              for layer in self.pitch_model.layers]
                             + [dict(initial=start_note_values, taps=[-1])])
        notes_result, updates = theano.scan(fn=self._predict_step_note,
                                            sequences=[time_final],
                                            outputs_info=note_outputs_info)
        output = get_last_layer(notes_result)
        # convert this step's output into the next step's input
        next_input = OutputFormToInputFormOp(self.data_manager)(output,
                                                                time + 1)
        return (ensure_list(new_states)
                + [next_input, time + 1, output]), updates

    # continue from the last seed frame, feeding each prediction back in
    time_outputs_info = (time_outputs_info_seed
                         + [dict(initial=self.generate_seed_input[-1],
                                 taps=[-1]),
                            dict(initial=n_time, taps=[-1]),
                            None])
    time_result, updates = theano.scan(fn=step_time,
                                       outputs_info=time_outputs_info,
                                       n_steps=self.steps_to_simulate)

    self.predicted_output = time_result[-1]

    # self.conservativity is a Theano variable defined elsewhere on the class
    self.generate_fun = theano.function(
        inputs=[self.steps_to_simulate,
                self.conservativity,
                self.generate_seed_input],
        outputs=self.predicted_output,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    print("Done")
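# The second scan above is a closed loop: the next input comes from the
# previous step's own output via outputs_info taps. The same mechanism can be
# seen in a minimal self-contained form below: a scan with no sequences that
# doubles its previous output n_steps times. This is a sketch of the pattern,
# not part of the model.

import numpy as np
import theano
import theano.tensor as T

n_steps = T.iscalar('n_steps')
x0 = T.vector('x0')

def step(x_prev):
    # x_prev plays the role of next_input fed back through taps=[-1]
    return 2 * x_prev

outputs, updates = theano.scan(fn=step,
                               outputs_info=[dict(initial=x0, taps=[-1])],
                               n_steps=n_steps)
closed_loop = theano.function([n_steps, x0], outputs, updates=updates)
print(closed_loop(4, np.ones(3, dtype=theano.config.floatX)))
# -> [[2 2 2], [4 4 4], [8 8 8], [16 16 16]]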
def BuildModel(modelSpecs, forTrain=True):
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    xem = None
    ##if any( k in modelSpecs['seq2matrixMode'] for k in ('SeqOnly', 'Seq+SS') ):
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')

    ## bounding box for cropping a large protein distance matrix;
    ## the box allows cropping at any position
    box = None
    if forTrain:
        box = T.ivector('boundingbox')

    ## trainByRefLoss can be either 1 or -1. When this variable exists, we
    ## train the model using both the reference loss and the loss on real data
    trainByRefLoss = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(rng, seqInput=x, matrixInput=y,
                                          mask_seq=xmask, mask_matrix=ymask,
                                          embedInput=xem, boundingbox=box,
                                          modelSpecs=modelSpecs)

    ## labelList is a list of label tensors, each with shape
    ## (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response])
    labelList = []
    if forTrain:
        ## when the model is used for training, we need to define the label variables
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = GetResponseValueDims(response)
            if labelType.startswith('Discrete'):
                if rValDims > 1:
                    ## if one response is a vector, we use a 4-d tensor;
                    ## wtensor holds 16-bit integers
                    labelList.append(T.wtensor4('Tlabel4' + response))
                else:
                    labelList.append(T.wtensor3('Tlabel4' + response))
            else:
                if rValDims > 1:
                    labelList.append(T.tensor4('Tlabel4' + response))
                else:
                    labelList.append(T.tensor3('Tlabel4' + response))

    ## weightList is a list of label weight tensors,
    ## each with shape (batchSize, seqLen, seqLen)
    weightList = []
    if len(labelList) > 0 and config.UseSampleWeight(modelSpecs):
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    if forTrain:
        return (distancePredictor, x, y, xmask, ymask, xem,
                labelList, weightList, box, trainByRefLoss)
    else:
        return distancePredictor, x, y, xmask, ymask, xem
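# All of the constructors used throughout these snippets follow Theano's
# dtype-prefix naming convention: b = int8, w = int16, i = int32, l = int64,
# and no prefix = floatX; the suffix (vector/matrix/tensor3/tensor4) fixes the
# number of dimensions. A quick self-contained check of the types used above:

import theano.tensor as T

print(T.btensor3('m').dtype, T.btensor3('m').ndim)  # int8, 3
print(T.wtensor3('l').dtype, T.wtensor3('l').ndim)  # int16, 3
print(T.imatrix('x').dtype, T.imatrix('x').ndim)    # int32, 2
print(T.lmatrix('s').dtype, T.lmatrix('s').ndim)    # int64, 2
print(T.tensor4('y').dtype, T.tensor4('y').ndim)    # floatX (e.g. float64), 4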