def jointModelOutput(num_sub_activities, num_affordances,
                     num_sub_activities_anticipation,
                     num_affordances_anticipation, inputJointFeatures,
                     inputHumanFeatures, inputObjectFeatures):
    """Build a ``SharedRNNOutput`` with detection and anticipation heads.

    A shared branch (temporal input followed by an LSTM) is combined with a
    human branch and an object branch (feature concatenation followed by an
    LSTM each).  Each of the two branches gets one softmax head for detection
    and one for anticipation, sized by the four ``num_*`` arguments.

    Returns the constructed ``SharedRNNOutput`` object.
    """
    def _lstm(size):
        # Every branch uses the same LSTM settings; only the last positional
        # argument differs (presumably the hidden size -- confirm with LSTM).
        return LSTM('tanh', 'sigmoid', 'orthogonal', 4, size)

    # Layers are instantiated in the original order in case their
    # initialisation draws from shared random state.
    shared_layers = [TemporalInputFeatures(inputJointFeatures), _lstm(128)]
    human_layers = [ConcatenateFeatures(inputHumanFeatures), _lstm(256)]
    object_layers = [ConcatenateFeatures(inputObjectFeatures), _lstm(256)]

    human_anticipation = [softmax(num_sub_activities_anticipation)]
    human_detection = [softmax(num_sub_activities)]
    object_anticipation = [softmax(num_affordances_anticipation)]
    object_detection = [softmax(num_affordances)]

    # Symbolic int64 label matrices, one per (branch, task) pair.
    y1_detection = T.lmatrix()
    y2_detection = T.lmatrix()
    y1_anticipation = T.lmatrix()
    y2_anticipation = T.lmatrix()

    return SharedRNNOutput(
        shared_layers, human_layers, object_layers,
        human_detection, human_anticipation,
        object_detection, object_anticipation,
        softmax_loss,
        y1_detection, y2_detection, y1_anticipation, y2_anticipation,
        1e-3,  # presumably the learning rate -- confirm with SharedRNNOutput
    )
def jointModelVectors(num_sub_activities, num_affordances,
                      inputJointFeatures, inputHumanFeatures,
                      inputObjectFeatures):
    """Build a ``SharedRNNVectors`` model for activity/affordance labelling.

    The shared, human and object branches each consist of a temporal input
    layer followed by an LSTM.  Each of the two classification stacks is a
    ``ConcatenateVectors`` layer followed by a softmax over the corresponding
    number of classes.

    Returns the constructed ``SharedRNNVectors`` object.
    """
    def _branch(features, size):
        # Temporal input followed by an LSTM with the common settings.
        return [
            TemporalInputFeatures(features),
            LSTM('tanh', 'sigmoid', 'orthogonal', 4, size),
        ]

    def _classifier(num_classes):
        return [ConcatenateVectors(), softmax(num_classes)]

    # Construction order matches the original implementation.
    shared_layers = _branch(inputJointFeatures, 128)
    human_layers = _branch(inputHumanFeatures, 256)
    human_activity_classification = _classifier(num_sub_activities)
    object_layers = _branch(inputObjectFeatures, 256)
    object_affordance_classification = _classifier(num_affordances)

    # Symbolic int64 label matrices for the two classification stacks.
    labels_human = T.lmatrix()
    labels_object = T.lmatrix()

    return SharedRNNVectors(
        shared_layers, human_layers, object_layers,
        human_activity_classification, object_affordance_classification,
        softmax_loss, labels_human, labels_object,
        1e-3,  # presumably the learning rate -- confirm with SharedRNNVectors
    )
def DRAmodelnoedge(nodeList,edgeList,edgeListComplete,edgeFeatures,nodeFeatures,nodeToEdgeConnections,clipnorm=25.0,train_for='joint'):
    """Build a ``DRAanticipation`` model whose edges are plain input layers.

    ``nodeList`` maps a node type to its number of classes, ``edgeFeatures``
    and ``nodeFeatures`` are indexed by edge type / node type.  The LSTM size
    is read from the module-level ``args.nodeRNN_size``.

    When ``train_for == 'joint'`` each node type gets two label matrices
    ('detection' and 'anticipation') and two softmax heads, the second with
    one extra class; otherwise a single label matrix and a single head.
    """
    lstm_init = 'orthogonal'
    softmax_init = 'uniform'
    # A single seeded generator is threaded through every layer, so the
    # order in which layers are created below must not change.
    rng = np.random.RandomState(1234567890)

    edgeRNNs = {}
    for edge_type in edgeList:
        edge_input = edgeFeatures[edge_type]
        print(edge_input)
        edgeRNNs[edge_type] = [TemporalInputFeatures(edge_input)]

    nodeRNNs = {}
    nodeLabels = {}
    outputLayer = {}
    joint = train_for == 'joint'
    for node_type in nodeList.keys():
        num_classes = nodeList[node_type]
        nodeRNNs[node_type] = [
            LSTM('tanh', 'sigmoid', lstm_init, truncate_gradient=4,
                 size=args.nodeRNN_size, rng=rng)
        ]
        if joint:
            nodeLabels[node_type] = {
                'detection': T.lmatrix(),
                'anticipation': T.lmatrix(),
            }
            detection_head = softmax(num_classes, softmax_init, rng=rng)
            anticipation_head = softmax(num_classes + 1, softmax_init, rng=rng)
            outputLayer[node_type] = [detection_head, anticipation_head]
        else:
            nodeLabels[node_type] = T.lmatrix()
            outputLayer[node_type] = [softmax(num_classes, softmax_init, rng=rng)]
        # Node features enter the graph through a pseudo-edge '<node>_input'.
        edgeRNNs[node_type + '_input'] = [TemporalInputFeatures(nodeFeatures[node_type])]

    learning_rate = T.fscalar()
    return DRAanticipation(edgeRNNs, nodeRNNs, outputLayer,
                           nodeToEdgeConnections, edgeListComplete,
                           softmax_loss, nodeLabels, learning_rate, clipnorm,
                           train_for=train_for)
def test_maxpool_layer_forward_pass():
    """Run a forward pass through embedding -> flatten -> dense -> MaxpoolLayer.

    The result of the compiled prediction function is printed, not asserted.
    """
    # Fixed 4x5 embedding table (integer valued).
    embedding_weights = np.array([
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
    ])

    # Dense weights: a 5x10 float matrix that is zero except for four
    # entries, handed to the layer transposed (10x5).
    dense_rows = np.zeros((5, 10), dtype=float)
    dense_rows[0, 4] = 1
    dense_rows[0, 9] = 1
    dense_rows[1, 3] = 1
    dense_rows[1, 8] = -0.5
    dense_weights = dense_rows.T

    bounds = T.lmatrix('bounds')
    X = T.lmatrix('X')
    bounds_in = InputLayer((None, 2), input_var=bounds)
    tokens_in = InputLayer((None, 2), input_var=X)

    embedded = lasagne.layers.EmbeddingLayer(
        tokens_in, input_size=4, output_size=5, W=embedding_weights)
    flat = lasagne.layers.FlattenLayer(embedded)
    dense = lasagne.layers.DenseLayer(
        flat, num_units=5, nonlinearity=rectify, W=dense_weights)
    pooled = MaxpoolLayer([bounds_in, dense])

    pred_func = theano.function(
        [bounds, X], get_output(pooled),
        allow_input_downcast=True, on_unused_input='warn')

    test_bounds = np.array([[0, 4]])
    test_X = np.array([[0, 1], [0, 0], [1, 1], [3, 3]])
    print(pred_func(test_bounds, test_X))
def multMatVect(v, A, m1, B, m2):
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    The compiled ``DotModulo`` function is built on first use and cached on
    the ``multMatVect.dot_modulo`` attribute, which must already exist (set
    to ``None``) before the first call.

    Returns the content of the compiled function's first output storage cell.

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    if multMatVect.dot_modulo is None:
        symbolic_inputs = [
            tensor.lmatrix('A'),
            tensor.ivector('s'),
            tensor.iscalar('m'),
            tensor.lmatrix('A2'),
            tensor.ivector('s2'),
            tensor.iscalar('m2'),
        ]
        multMatVect.dot_modulo = function(
            symbolic_inputs, DotModulo()(*symbolic_inputs), profile=False)

    # Write straight into the input storage cells and invoke the raw thunk;
    # this bypasses the per-call overhead of the Theano function wrapper.
    compiled = multMatVect.dot_modulo
    call_values = (A, v[:3], m1, B, v[3:], m2)
    for cell, value in zip(compiled.input_storage, call_values):
        cell.storage[0] = value
    compiled.fn()
    return compiled.output_storage[0].storage[0]
def multMatVect(v, A, m1, B, m2):
    """
    Multiply the first half of v by A with a modulo of m1
    and the second half by B with a modulo of m2.

    The compiled function is created lazily and stored on the
    ``multMatVect.dot_modulo`` attribute (expected to be ``None`` initially).

    Note: The parameters of dot_modulo are passed implicitly because passing
    them explicitly takes more time than running the function's C-code.
    """
    if multMatVect.dot_modulo is None:
        mat_a, vec_a, mod_a = (tensor.lmatrix("A"), tensor.ivector("s"),
                               tensor.iscalar("m"))
        mat_b, vec_b, mod_b = (tensor.lmatrix("A2"), tensor.ivector("s2"),
                               tensor.iscalar("m2"))
        result = DotModulo()(mat_a, vec_a, mod_a, mat_b, vec_b, mod_b)
        multMatVect.dot_modulo = function(
            [mat_a, vec_a, mod_a, mat_b, vec_b, mod_b], result)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    fn = multMatVect.dot_modulo
    arguments = (A, v[:3], m1, B, v[3:], m2)
    for position, argument in enumerate(arguments):
        fn.input_storage[position].storage[0] = argument
    fn.fn()
    return fn.output_storage[0].storage[0]
def test_multMatVect():
    """Check ``DotModulo`` against ``rng_mrg.matVecModM`` on random inputs.

    The compiled function is driven through its raw storage cells, and its
    output is compared half by half with the reference implementation.
    """
    sym_A1 = tensor.lmatrix('A1')
    sym_s1 = tensor.ivector('s1')
    sym_m1 = tensor.iscalar('m1')
    sym_A2 = tensor.lmatrix('A2')
    sym_s2 = tensor.ivector('s2')
    sym_m2 = tensor.iscalar('m2')
    sym_inputs = [sym_A1, sym_s1, sym_m1, sym_A2, sym_s2, sym_m2]

    f0 = theano.function(sym_inputs, rng_mrg.DotModulo()(*sym_inputs))

    i32max = numpy.iinfo(numpy.int32).max

    def _random_case():
        # Draw order (matrix, vector, modulus) matches the original test.
        matrix = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
        vector = numpy.random.randint(0, i32max, 3).astype('int32')
        modulus = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
        return matrix, vector, modulus

    A1, s1, m1 = _random_case()
    A2, s2, m2 = _random_case()

    for cell, value in zip(f0.input_storage, (A1, s1, m1, A2, s2, m2)):
        cell.storage[0] = value

    expected_first = rng_mrg.matVecModM(A1, s1, m1)
    expected_second = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    actual = f0.output_storage[0].value

    assert numpy.allclose(expected_first, actual[:3])
    assert numpy.allclose(expected_second, actual[3:])
def jointModelOutput(num_sub_activities, num_affordances,
                     num_sub_activities_anticipation,
                     num_affordances_anticipation,
                     inputJointFeatures, inputHumanFeatures,
                     inputObjectFeatures):
    """Create the shared-RNN model with detection and anticipation outputs.

    Three layer stacks are built (shared, human, object) together with four
    single-softmax output stacks and four symbolic label matrices, and all of
    them are passed to ``SharedRNNOutput``, whose instance is returned.
    """
    lstm_settings = ('tanh', 'sigmoid', 'orthogonal', 4)

    # Objects are created in the same sequence as the original code.
    shared_layers = [
        TemporalInputFeatures(inputJointFeatures),
        LSTM(*(lstm_settings + (128,))),
    ]
    human_layers = [
        ConcatenateFeatures(inputHumanFeatures),
        LSTM(*(lstm_settings + (256,))),
    ]
    object_layers = [
        ConcatenateFeatures(inputObjectFeatures),
        LSTM(*(lstm_settings + (256,))),
    ]

    heads = {}
    for key, width in (
            ('human_anticipation', num_sub_activities_anticipation),
            ('human_detection', num_sub_activities),
            ('object_anticipation', num_affordances_anticipation),
            ('object_detection', num_affordances)):
        heads[key] = [softmax(width)]

    # Label matrices: detection for both branches first, then anticipation.
    label_vars = [T.lmatrix() for _ in range(4)]

    model_args = [
        shared_layers, human_layers, object_layers,
        heads['human_detection'], heads['human_anticipation'],
        heads['object_detection'], heads['object_anticipation'],
        softmax_loss,
    ] + label_vars + [1e-3]
    return SharedRNNOutput(*model_args)
def test_blocksparse_grad_merge():
    """Compare a GPU-mode SGD update of W with the same update in another mode.

    Both compiled functions apply ``W <- W - lr * dW``; W is reset between
    the two runs and the resulting values must be close.
    """
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()
    sym_inputs = [h, iIdx, b, oIdx]

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    call_values = (h_val, iIdx_val, b_val, oIdx_val)
    W = float32_shared_constructor(W_val)

    out = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    grad_W = theano.grad(out.sum(), W)
    step = numpy.asarray(0.05, dtype='float32')
    sgd_update = [(W, W - step * grad_W)]

    gpu_fn = theano.function(sym_inputs, updates=sgd_update,
                             mode=mode_with_gpu)

    # not running with mode=gpu ensures that the elemwise is not merged in
    reference_mode = (theano.compile.mode.get_mode('FAST_RUN')
                      if theano.config.mode == 'FAST_COMPILE' else None)
    reference_fn = theano.function(sym_inputs, updates=sgd_update,
                                   mode=reference_mode)

    reference_fn(*call_values)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    gpu_fn(*call_values)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def jointModel(num_sub_activities, num_affordances, inputJointFeatures,
               inputHumanFeatures, inputObjectFeatures):
    """Build a ``SharedRNN`` with one softmax per branch.

    The shared branch is a temporal input followed by an LSTM; the human and
    object branches are feature concatenation, an LSTM and a softmax over
    ``num_sub_activities`` / ``num_affordances`` classes respectively.  All
    layers share one seeded ``RandomState``.

    Returns the constructed ``SharedRNN`` object.
    """
    lstm_init = 'orthogonal'
    softmax_init = 'uniform'
    # One generator for all layers: creation order determines the weights.
    rng = np.random.RandomState(1234567890)

    def _lstm(size):
        return LSTM('tanh', 'sigmoid', lstm_init, 4, size, rng=rng)

    def _branch(features, num_classes):
        # List elements are evaluated left to right, so the LSTM draws from
        # ``rng`` before the softmax does, as in the original code.
        return [
            ConcatenateFeatures(features),
            _lstm(256),
            softmax(num_classes, softmax_init, rng=rng),
        ]

    shared_layers = [TemporalInputFeatures(inputJointFeatures), _lstm(128)]
    human_layers = _branch(inputHumanFeatures, num_sub_activities)
    object_layers = _branch(inputObjectFeatures, num_affordances)

    labels_human = T.lmatrix()
    labels_object = T.lmatrix()
    return SharedRNN(shared_layers, human_layers, object_layers,
                     softmax_loss, labels_human, labels_object, 1e-3)
def test_multMatVect():
    """Verify ``DotModulo`` output against two ``matVecModM`` reference calls.

    Inputs are random int64 matrices, int32 vectors and int32 moduli; they
    are written into the compiled function's storage cells directly.
    """
    symbols = [
        tensor.lmatrix('A1'), tensor.ivector('s1'), tensor.iscalar('m1'),
        tensor.lmatrix('A2'), tensor.ivector('s2'), tensor.iscalar('m2'),
    ]
    g0 = rng_mrg.DotModulo()(*symbols)
    f0 = theano.function(symbols, g0)

    i32max = np.iinfo(np.int32).max

    # Random draws happen in the original order:
    # matrix, vector, modulus for the first half, then for the second.
    numeric = []
    for _ in range(2):
        numeric.append(np.random.randint(0, i32max, (3, 3)).astype('int64'))
        numeric.append(np.random.randint(0, i32max, 3).astype('int32'))
        numeric.append(np.asarray(np.random.randint(i32max), dtype="int32"))

    for index, value in enumerate(numeric):
        f0.input_storage[index].storage[0] = value

    reference_first = rng_mrg.matVecModM(*numeric[:3])
    reference_second = rng_mrg.matVecModM(*numeric[3:])
    f0.fn()
    produced = f0.output_storage[0].value

    assert np.allclose(reference_first, produced[:3])
    assert np.allclose(reference_second, produced[3:])
def train_minibatch_fn(self, evaluate=False):
    """Compile and return the minibatch training function.

    The compiled function takes ``X, L_x, Y, L_y, learning_rate, momentum,
    weight_decay`` and returns ``[loss, accuracy]``; when ``evaluate`` is
    true, the two values returned by ``self.evaluate`` are appended.  The
    updates come from ``self.get_sgd_updates``.

    Intended to be called once, since every call compiles a new function.
    """
    X = T.lmatrix('X_train')
    L_x = T.lvector('L_X_train')
    Y = T.lmatrix('Y_train')
    L_y = T.lvector('L_y_train')
    data_inputs = [X, L_x, Y, L_y]

    learning_rate = T.dscalar('learning_rate')
    momentum = T.dscalar('momentum')
    weight_decay = T.dscalar('weight_decay')

    loss, accuracy = self.loss(X, L_x, Y, L_y, weight_decay)
    updates = self.get_sgd_updates(loss, learning_rate, momentum)

    outputs = [loss, accuracy]
    if evaluate:
        precision, recall = self.evaluate(*data_inputs)
        outputs += [precision, recall]

    return theano.function(
        inputs=data_inputs + [learning_rate, momentum, weight_decay],
        outputs=outputs,
        updates=updates,
    )
def multMatVect(v, A, m1, B, m2):
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    ``v[:3]`` is paired with ``A``/``m1`` and ``v[3:]`` with ``B``/``m2``.
    The Theano function is compiled on the first call and cached on
    ``multMatVect.dot_modulo`` (which must be initialised to ``None``).

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    cached = multMatVect.dot_modulo
    if cached is None:
        A_sym, s_sym, m_sym = (tensor.lmatrix("A"), tensor.ivector("s"),
                               tensor.iscalar("m"))
        A2_sym, s2_sym, m2_sym = (tensor.lmatrix("A2"), tensor.ivector("s2"),
                                  tensor.iscalar("m2"))
        graph = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        cached = function(
            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], graph,
            profile=False)
        multMatVect.dot_modulo = cached

    def _store(slot, value):
        # Fill one input cell of the compiled function in place.
        cached.input_storage[slot].storage[0] = value

    # This way of calling the Theano fct is done to bypass Theano overhead.
    _store(0, A)
    _store(1, v[:3])
    _store(2, m1)
    _store(3, B)
    _store(4, v[3:])
    _store(5, m2)
    cached.fn()
    return cached.output_storage[0].storage[0]
def arch_memnet_selfsup(self):
    '''
    Memory net with self supervision.

    Builds the query embedding (optionally multiplied by a position
    encoding), averages it over axis 1 and asks a ``MemoryLayer`` for the
    probabilities.

    Returns ``(probs, inputs, params)`` where ``inputs`` maps the names
    'contexts', 'querys', 'yvs' and 'cvs' to symbolic variables and
    ``params`` holds the question embedding parameters.
    '''
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lmatrix('yvs')

    question_layer = Embed(self.vocab_size, self.hidden_dim)
    embedded_shape = (self.batchsize, self.sen_maxlen, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()), embedded_shape)

    if self.kwargs.get('position_encoding'):
        # Add a leading broadcastable axis so the encoding applies per batch.
        pe = position_encoding(self.sen_maxlen, self.hidden_dim)
        lmat = pe.dimshuffle('x', 0, 1)
        print('[memory network] use PE')
        q = q * lmat

    u = mean(q, axis=1)
    params = list(question_layer.params)

    mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size,
                            self.vocab_size, self.hidden_dim, **self.kwargs)
    probs = mem_layer.get_probs(contexts, u).dimshuffle(0, 2)

    inputs = {}
    inputs['contexts'] = contexts
    inputs['querys'] = querys
    inputs['yvs'] = yvs
    inputs['cvs'] = T.lmatrix('cvs')
    return (probs, inputs, params)
def get_sampling_model_and_input(exp_config):
    """Build the sampling model and load its trained parameters.

    Reads the vocabulary sizes, embedding sizes and hidden sizes from
    ``exp_config`` and the parameter file path from
    ``exp_config['saved_parameters']``.

    Returns ``(sampling_model, sampling_source_input, encoder, decoder)``.
    """
    # Build the encoder/decoder bricks
    src_vocab_size = exp_config['src_vocab_size']
    enc_embed = exp_config['enc_embed']
    enc_nhids = exp_config['enc_nhids']
    encoder = BidirectionalEncoder(src_vocab_size, enc_embed, enc_nhids)

    decoder = Decoder(
        exp_config['trg_vocab_size'],
        exp_config['dec_embed'],
        exp_config['dec_nhids'],
        exp_config['enc_nhids'] * 2,
        loss_function='min_risk',
    )

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_source_input = tensor.lmatrix('source')
    sampling_target_prefix_input = tensor.lmatrix('target')

    # Build the sampling graph; the source mask is all ones
    logger.info("Building sampling model")
    all_ones_mask = tensor.ones(sampling_source_input.shape)
    sampling_representation = encoder.apply(sampling_source_input,
                                            all_ones_mask)
    generated = decoder.generate(
        sampling_source_input,
        sampling_representation,
        target_prefix=sampling_target_prefix_input,
    )

    # build the model that will let us get a theano function from the
    # sampling graph
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    # Set the parameters from a trained model (an .npz file)
    saved_path = exp_config['saved_parameters']
    logger.info("Loading parameters from model: {}".format(saved_path))
    param_values = LoadNMT.load_parameter_values(saved_path,
                                                 brick_delimiter='-')
    LoadNMT.set_model_parameters(sampling_model, param_values)

    return sampling_model, sampling_source_input, encoder, decoder
def create_phones_encoder(config):
    """Create and initialise the phones encoder and its two representations.

    The training representation uses explicit mask inputs; the sampling
    representation uses all-ones masks shaped like the first two dimensions
    of the corresponding sampling inputs.

    Returns ``(encoder, training_representation, sampling_representation)``.
    """
    def _ones_like_2d(variable):
        # All-ones mask matching the first two dimensions of ``variable``.
        return tensor.ones((variable.shape[0], variable.shape[1]))

    encoder = BidirectionalPhonesEncoder(
        config['phones_vocab_size'], config['enc_embed'], config['enc_nhids'])

    # Default initialisation first, then orthogonal overrides for the
    # recurrent and embedding prototypes.
    encoder.weights_init = IsotropicGaussian(config['weight_scale'])
    encoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.embedding.prototype.weights_init = Orthogonal()
    encoder.initialize()

    # Training graph
    phones = tensor.lmatrix('phones')
    phones_mask = tensor.matrix('phones_mask')
    phones_words_ends = tensor.lmatrix('phones_words_ends')
    phones_words_ends_mask = tensor.matrix('phones_words_ends_mask')
    training_representation = encoder.apply(
        phones, phones_mask, phones_words_ends, phones_words_ends_mask)
    training_representation.name = "phones_representation"

    # Sampling graph
    sampling_phones = tensor.lmatrix('sampling_phones')
    sampling_phones_mask = _ones_like_2d(sampling_phones)
    sampling_phones_words_ends = tensor.lmatrix('sampling_phones_words_ends')
    sampling_phones_words_ends_mask = _ones_like_2d(sampling_phones_words_ends)
    sampling_representation = encoder.apply(
        sampling_phones, sampling_phones_mask,
        sampling_phones_words_ends, sampling_phones_words_ends_mask)

    return encoder, training_representation, sampling_representation
def create_audio_encoder(config):
    """Create and initialise the audio encoder and its two representations.

    The training representation is built from explicit audio / word-end
    masks; the sampling representation uses all-ones masks derived from the
    first two dimensions of the sampling inputs.

    Returns ``(encoder, training_representation, sampling_representation)``.
    """
    encoder = BidirectionalAudioEncoder(
        config['audio_feat_size'], config['enc_embed'], config['enc_nhids'])

    # Global init scheme, pushed down, then orthogonal overrides.
    encoder.weights_init = IsotropicGaussian(config['weight_scale'])
    encoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    for brick in (encoder.bidir, encoder.embedding):
        brick.prototype.weights_init = Orthogonal()
    encoder.initialize()

    # Training graph
    audio = tensor.ftensor3('audio')
    audio_mask = tensor.matrix('audio_mask')
    words_ends = tensor.lmatrix('words_ends')
    words_ends_mask = tensor.matrix('words_ends_mask')
    training_representation = encoder.apply(
        audio, audio_mask, words_ends, words_ends_mask)
    training_representation.name = "audio_representation"

    # Sampling graph with all-ones masks
    sampling_audio = tensor.ftensor3('sampling_audio')
    audio_rows, audio_cols = sampling_audio.shape[0], sampling_audio.shape[1]
    sampling_audio_mask = tensor.ones((audio_rows, audio_cols))

    sampling_words_ends = tensor.lmatrix('sampling_words_ends')
    ends_rows = sampling_words_ends.shape[0]
    ends_cols = sampling_words_ends.shape[1]
    sampling_words_ends_mask = tensor.ones((ends_rows, ends_cols))

    sampling_representation = encoder.apply(
        sampling_audio, sampling_audio_mask,
        sampling_words_ends, sampling_words_ends_mask)

    return encoder, training_representation, sampling_representation
def DRAmodelnoedge(nodeList,
                   edgeList,
                   edgeListComplete,
                   edgeFeatures,
                   nodeFeatures,
                   nodeToEdgeConnections,
                   clipnorm=25.0,
                   train_for='joint'):
    """Assemble a ``DRAanticipation`` model without recurrent edge layers.

    Every edge type in ``edgeList`` and every node type in ``nodeList`` is
    wired in through a plain ``TemporalInputFeatures`` layer (node inputs are
    stored under the key ``'<node type>_input'``).  Each node type gets one
    LSTM, sized by the module-level ``args.nodeRNN_size``, plus either two
    softmax heads (``train_for == 'joint'``) or one.
    """
    lstm_init = 'orthogonal'
    softmax_init = 'uniform'
    # Shared seeded generator: do not reorder the layer constructions below.
    rng = np.random.RandomState(1234567890)

    def _input_layer(features):
        return [TemporalInputFeatures(features)]

    edgeRNNs = {}
    for et in edgeList:
        features = edgeFeatures[et]
        print(features)
        edgeRNNs[et] = _input_layer(features)

    nodeRNNs, nodeLabels, outputLayer = {}, {}, {}
    for nt in nodeList.keys():
        num_classes = nodeList[nt]
        nodeRNNs[nt] = [
            LSTM('tanh',
                 'sigmoid',
                 lstm_init,
                 truncate_gradient=4,
                 size=args.nodeRNN_size,
                 rng=rng)
        ]

        if train_for != 'joint':
            nodeLabels[nt] = T.lmatrix()
            outputLayer[nt] = [softmax(num_classes, softmax_init, rng=rng)]
        else:
            labels = {}
            labels['detection'] = T.lmatrix()
            labels['anticipation'] = T.lmatrix()
            nodeLabels[nt] = labels
            # The anticipation head has one more class than detection.
            outputLayer[nt] = [
                softmax(num_classes, softmax_init, rng=rng),
                softmax(num_classes + 1, softmax_init, rng=rng),
            ]

        edgeRNNs[nt + '_input'] = _input_layer(nodeFeatures[nt])

    learning_rate = T.fscalar()
    dra = DRAanticipation(edgeRNNs,
                          nodeRNNs,
                          outputLayer,
                          nodeToEdgeConnections,
                          edgeListComplete,
                          softmax_loss,
                          nodeLabels,
                          learning_rate,
                          clipnorm,
                          train_for=train_for)
    return dra
def multMatVect(v, A, m1, B, m2):
    """
    Multiply the first half of v by A with a modulo of m1
    and the second half by B with a modulo of m2.

    The Theano function wrapping ``DotModulo`` is compiled on first use and
    kept on ``multMatVect.dot_modulo`` for subsequent calls.

    Note: The parameters of dot_modulo are passed implicitly because passing
    them explicitly takes more time than running the function's C-code.
    """
    if multMatVect.dot_modulo is None:
        # (constructor, name) pairs in the order DotModulo expects them.
        specs = (
            (tensor.lmatrix, 'A'),
            (tensor.ivector, 's'),
            (tensor.iscalar, 'm'),
            (tensor.lmatrix, 'A2'),
            (tensor.ivector, 's2'),
            (tensor.iscalar, 'm2'),
        )
        placeholders = [make(name) for make, name in specs]
        output = DotModulo()(*placeholders)
        multMatVect.dot_modulo = function(placeholders, output)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    dot_modulo = multMatVect.dot_modulo
    cells = dot_modulo.input_storage
    first_half, second_half = v[:3], v[3:]
    cells[0].storage[0] = A
    cells[1].storage[0] = first_half
    cells[2].storage[0] = m1
    cells[3].storage[0] = B
    cells[4].storage[0] = second_half
    cells[5].storage[0] = m2
    dot_modulo.fn()
    return dot_modulo.output_storage[0].storage[0]
def main(config, tr_stream):
    """Rebuild the character-level NMT graph, reload it and export embeddings.

    Builds the encoder/decoder cost graph, wraps it in a ``MainLoop`` whose
    only extension is ``LoadNMT(config['saveto'])``, runs the
    'before_training' extensions (which is where the saved model is
    reloaded), and finally passes the encoder decimator's character
    embedding model together with the source vocabulary to ``embedding``.

    Parameters
    ----------
    config : mapping
        Configuration values; the keys read here are visible in the body
        (vocabulary sizes, layer sizes/depths, 'src_vocab', 'saveto').
    tr_stream : object
        Must expose ``trg_bos`` and ``space_idx['target']``.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Close the vocabulary file deterministically instead of leaking the
    # handle.  NOTE(review): pickle executes arbitrary code on load; only
    # point config['src_vocab'] at trusted files.
    with open(config['src_vocab'], 'rb') as vocab_file:
        src_vocab = pickle.load(vocab_file)

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'],
                      config['dec_embed'],
                      config['trg_dgru_nhids'],
                      config['trg_igru_nhids'],
                      config['dec_nhids'],
                      config['enc_nhids'] * 2,
                      config['transition_depth'],
                      config['trg_igru_depth'],
                      config['trg_dgru_depth'],
                      target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop; no algorithm or data stream is needed because
    # the loop is only used to run the 'before_training' extensions.
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T,
                                             source_sample_matrix,
                                             source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
def _generate_train_model_item_function(self):
    """Compile ``self.train_model_item``, a pairwise-ranking training step.

    Creates the shared variables ``H1``, ``B1``, ``M1``, ``M2``, ``K``, ``D``
    and ``N`` on ``self`` (and aliases ``self.W1`` to ``bpr_item.W``), builds
    the scores ``x_ui`` and ``x_uj`` from the same formula, and minimises the
    negated regularised log-sigmoid of their difference with plain gradient
    steps on ``M1``, ``M2``, ``K``, ``N`` and ``D``.

    Relies on the module-level names ``bpr_item``, ``H_item`` and ``B_item``.
    The compiled function takes ``u, i, j, n1, n2, di, dj`` and returns the
    cost.
    """
    # Symbolic inputs.  NOTE(review): presumably u = users, i / j = the two
    # compared item bundles, n1 / n2 = their sizes, di / dj = extra scalar
    # features -- confirm against the caller.
    u = T.lvector('u')
    i = T.lmatrix('i')
    j = T.lmatrix('j')
    n1 = T.lvector('n1')
    n2 = T.lvector('n2')
    di = T.dvector('di')
    dj = T.dvector('dj')
    # W1 is taken as-is from an existing object and is not updated below.
    self.W1 = bpr_item.W
    self.H1 = theano.shared(H_item.astype('float32'), name='H')
    self.B1 = theano.shared(B_item.astype('float32'), name='B')
    # Trainable parameters, randomly initialised.
    self.M1 = theano.shared(numpy.random.random(
        (self._rank, self._rank)).astype('float64'), name='M1')
    self.M2 = theano.shared(numpy.random.random(
        (self._rank, self._rank)).astype('float64'), name='M2')
    self.K = theano.shared(numpy.random.rand(), name='K')
    self.D = theano.shared(numpy.random.rand(), name='D')
    self.N = theano.shared(numpy.random.random(
        self._bundle_rank).astype('float32'), name='N')
    # Score of the first bundle; only the diagonal of the product is kept.
    x_ui = T.dot(
        T.dot(self.W1[u], self.M2),
        T.dot(self.M1, self.H1[i].sum(axis=1).T / n1)).diagonal() + self.K * (self.B1[i].T / n1).T.sum(
            axis=1) + self.N[n1] + self.D * di
    # Score of the second bundle, same formula with j / n2 / dj.
    x_uj = T.dot(
        T.dot(self.W1[u], self.M2),
        T.dot(self.M1, self.H1[j].sum(axis=1).T / n2)).diagonal() + self.K * (self.B1[j].T / n2).T.sum(
            axis=1) + self.N[n2] + self.D * dj
    x_uij = T.nnet.sigmoid(x_ui - x_uj)
    # NOTE(review): the penalty terms sit inside T.sum, so they appear to be
    # combined element-wise with log(x_uij) before the summation -- confirm
    # this is intended rather than a penalty added once.
    obj = T.sum(T.log(x_uij) - self._lambda_u * (self.M1 ** 2).sum() - \
        self._lambda_u * (self.M2 ** 2).sum() - self._lambda_d * (self.K**2) - self._lambda_d * (self.D**2)\
        -self._lambda_p * (self.N[n2]**2) - self._lambda_p * (self.N[n1]**2))
    cost = -obj
    g_cost_M1 = T.grad(cost=cost, wrt=self.M1)
    g_cost_M2 = T.grad(cost=cost, wrt=self.M2)
    g_cost_K = T.grad(cost=cost, wrt=self.K)
    g_cost_N = T.grad(cost=cost, wrt=self.N)
    g_cost_D = T.grad(cost=cost, wrt=self.D)
    # M1, M2 and K use a step scaled by .001; N and D use the full rate.
    updates = [(self.M1, self.M1 - self._learning_rate * .001 * g_cost_M1),
               (self.M2, self.M2 - self._learning_rate * .001 * g_cost_M2),
               (self.K, self.K - self._learning_rate * .001 * g_cost_K),
               (self.N, self.N - self._learning_rate * g_cost_N),
               (self.D, self.D - self._learning_rate * g_cost_D)]
    self.train_model_item = theano.function(
        inputs=[u, i, j, n1, n2, di, dj], outputs=cost, updates=updates)
def get_fns(self, input_dim=123, p_learning_rate=0.01,
            d_learning_rate=0.0001, p=0.23928176569346055):
    """Compile the training and validation callables.

    Parameters
    ----------
    input_dim, p_learning_rate, p :
        Forwarded to ``self.primal_step``.
    d_learning_rate :
        Accepted for interface compatibility; not used in this method.

    Returns
    -------
    (train_fn, valid_fns)
        ``train_fn(x, y, mask)`` runs one primal step (with the mask
        transposed) followed by one dual step and returns the two outputs of
        the dual step.  ``valid_fns(X_mat, Y_mat, mask_mat, flag=0)`` returns
        ``(self.dual_class.perf(pred_labels, Y_mat, flag), pred_labels)``
        with both arrays flattened.
    """
    x = T.lmatrix('X')
    y = T.vector('y')
    m = T.lmatrix('mask_tr')

    # Only the updates and the (r, q) pair of the primal step are used here.
    primal_updates, _loss_weighed, _reward, primal_var = self.primal_step(
        x, y, p_learning_rate, input_dim, p, mask=m)
    [r, q] = primal_var
    dual_updates = self.dual_class.dual_updates(r=r, q=q)

    primal_train_fn = theano.function([x, y, m],
                                      [r[0], self.alpha[0]],
                                      updates=primal_updates,
                                      name="Primal Train")
    dual_train_fn = theano.function([],
                                    [self.alpha[0], self.beta[0]],
                                    updates=dual_updates,
                                    name="Dual Train")

    def train_fn(x, y, mask):
        # The primal outputs are not needed; the call is made for its updates.
        primal_train_fn(x, y, mask.transpose())
        alpha_d, beta_d = dual_train_fn()
        return alpha_d, beta_d

    # Calculate Validation in batch_mode for speedup
    x_mat = T.lmatrix('x_mat')
    y_mat = T.vector('y_mat')
    mask_mat = T.lmatrix('mask_te')
    pred_labels = self.calc_cost(self.model, x_mat, y_mat, mask_mat)
    valid_th_fns = theano.function([x_mat, mask_mat], pred_labels)

    def valid_fns(X_mat, Y_mat, mask_mat, flag=0):
        Y_mat = Y_mat.ravel()
        pred_labels = valid_th_fns(X_mat, mask_mat).ravel()
        return self.dual_class.perf(pred_labels, Y_mat, flag), pred_labels

    return train_fn, valid_fns
def test_correct_solution(self):
    """Check that ``lstsq`` returns a solution ``m`` with ``A . m == B``.

    The symbolic graph is compiled once (the original compiled the identical
    function twice) and evaluated on a small exactly solvable 2x2 system.
    """
    x = tensor.lmatrix()
    y = tensor.lmatrix()
    z = tensor.lscalar()
    b = theano.tensor.nlinalg.lstsq()(x, y, z)
    f = function([x, y, z], b)

    TestMatrix1 = np.asarray([[2, 1], [3, 4]])
    TestMatrix2 = np.asarray([[17, 20], [43, 50]])
    TestScalar = np.asarray(1)

    m = f(TestMatrix1, TestMatrix2, TestScalar)
    # m[0] is the solution; the remaining outputs are not checked here.
    self.assertTrue(np.allclose(TestMatrix2, np.dot(TestMatrix1, m[0])))
def test_blocksparse_gpu_gemv_opt():
    """Check that the second-to-last node of the GPU graph is GpuSparseBlockGemv."""
    b, W, h = tensor.fmatrix(), tensor.ftensor4(), tensor.ftensor3()
    iIdx, oIdx = tensor.lmatrix(), tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    compiled = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    ordered_nodes = compiled.maker.fgraph.toposort()
    penultimate_op = ordered_nodes[-2].op
    assert isinstance(penultimate_op, GpuSparseBlockGemv)
def test_blocksparse_gpu_gemv_opt():
    """Check that the compiled GPU graph contains exactly one GpuSparseBlockGemv."""
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()
    graph_inputs = [W, h, iIdx, b, oIdx]

    f = theano.function(graph_inputs, sparse_block_dot(*graph_inputs),
                        mode=mode_with_gpu)

    gemv_nodes = [node for node in f.maker.fgraph.apply_nodes
                  if isinstance(node.op, GpuSparseBlockGemv)]
    assert len(gemv_nodes) == 1
def test7():
    """Compile the Viterbi ``apply`` and ``decode`` graphs and print a decode.

    Nothing is asserted; the decoded result for a fixed 3-state example is
    printed.
    """
    A = T.lmatrix("A")
    A_start = T.lvector("A_start")
    f = T.lmatrix("f")
    tgt = T.ivector("tgt")
    sym_inputs = [A, A_start, f, tgt]

    v = Viterbi(A, A_start, f, tgt)
    decode = v.decode()
    # Compiled only to make sure the ``apply`` graph builds; never called.
    ff = theano.function(sym_inputs, outputs=v.apply())
    ff2 = theano.function(sym_inputs, decode)

    transitions = [[1, 3, 1],
                   [1, 2, 2],
                   [2, 1, 3]]
    start_scores = [1, 2, 1]
    emissions = [[1, 2, 3],
                 [2, 2, 1],
                 [3, 3, 2],
                 [1, 1, 2]]
    targets = [1, 2, 1, 2]
    print(ff2(transitions, start_scores, emissions, targets))
def setup_backprop(self):
    """Compile the NLL and backprop functions, with optional distractors.

    Always sets ``self._get_nll`` and ``self._backprop``.  When
    ``self.distract_num`` is positive, ``self._get_nll_distract`` and
    ``self._backprop_distract`` are compiled functions that take
    ``distract_num`` additional input vectors whose annotations are
    concatenated with the main ones; otherwise both are empty lists.
    """
    eta = T.scalar('eta_for_backprop')
    x = T.lvector('x_for_backprop')
    y = T.lvector('y_for_backprop')
    y_in_x_inds = T.lmatrix('y_in_x_inds_for_backprop')
    y_in_src_inds = T.lmatrix('y_in_src_inds_for_backprop')
    y_in_domain = T.lmatrix('y_in_domain_for_backprop')
    l2_reg = T.scalar('l2_reg_for_backprop')

    # Input lists shared by the plain and the distractor variants.
    nll_inputs = [x, y, y_in_x_inds, y_in_src_inds, y_in_domain]
    backprop_inputs = [x, y, eta, y_in_x_inds, y_in_src_inds, y_in_domain,
                       l2_reg]
    target_args = (y, y_in_x_inds, y_in_src_inds, y_in_domain, eta, l2_reg)

    # Normal operation
    dec_init_state, annotations = self._symb_encoder(x)
    nll, p_y_seq, objective, updates = self._setup_backprop_with(
        dec_init_state, annotations, *target_args)
    self._get_nll = theano.function(
        inputs=nll_inputs, outputs=nll, on_unused_input='warn')
    self._backprop = theano.function(
        inputs=backprop_inputs,
        outputs=[p_y_seq, objective],
        updates=updates,
        on_unused_input='warn')

    # Add distractors
    self._get_nll_distract = []
    self._backprop_distract = []
    if self.distract_num <= 0:
        return

    x_distracts = [
        T.lvector('x_distract_%d_for_backprop' % k)
        for k in range(self.distract_num)
    ]
    # Only the annotations of each distractor encoding are used.
    all_annotations = [annotations] + [
        self._symb_encoder(x_d)[1] for x_d in x_distracts
    ]
    annotations_with_distract = T.concatenate(all_annotations, axis=0)

    nll_d, p_y_seq_d, objective_d, updates_d = self._setup_backprop_with(
        dec_init_state, annotations_with_distract, *target_args)
    self._get_nll_distract = theano.function(
        inputs=nll_inputs + x_distracts,
        outputs=nll_d,
        on_unused_input='warn')
    self._backprop_distract = theano.function(
        inputs=backprop_inputs + x_distracts,
        outputs=[p_y_seq_d, objective_d],
        updates=updates_d)
def test_blocksparse_gpu_outer_opt():
    """Check that the second-to-last node of the GPU graph is GpuSparseBlockOuter.

    The compiled function returns both the sparse block dot product and the
    gradient of its sum with respect to W.
    """
    b, W, h = tensor.fmatrix(), tensor.ftensor4(), tensor.ftensor3()
    iIdx, oIdx = tensor.lmatrix(), tensor.lmatrix()

    product = sparse_block_dot(W, h, iIdx, b, oIdx)
    outputs = [product, tensor.grad(product.sum(), wrt=W)]
    compiled = theano.function([W, h, iIdx, b, oIdx], outputs,
                               mode=mode_with_gpu)

    penultimate_op = compiled.maker.fgraph.toposort()[-2].op
    assert isinstance(penultimate_op, GpuSparseBlockOuter)
def _generate_test_model_function(self):
    """Compile ``self.test_model``, which returns the score difference.

    Both scores are built from the same formula over the shared variables
    ``W1``, ``M1``, ``M2``, ``H1``, ``B1``, ``K``, ``N`` and ``D`` already
    present on ``self``; the compiled function takes
    ``u, i, j, n1, n2, di, dj`` and returns ``x_ui - x_uj``.
    """
    u = T.lvector('u')
    i = T.lmatrix('i')
    j = T.lmatrix('j')
    n1 = T.lvector('n1')
    n2 = T.lvector('n2')
    di = T.dvector('di')
    dj = T.dvector('dj')

    def _score(items, counts, extra):
        # Same expression as the training score, split into named parts.
        user_side = T.dot(self.W1[u], self.M2)
        item_side = T.dot(self.M1, self.H1[items].sum(axis=1).T / counts)
        interaction = T.dot(user_side, item_side).diagonal()
        bias = self.K * (self.B1[items].T / counts).T.sum(axis=1)
        return interaction + bias + self.N[counts] + self.D * extra

    x_ui = _score(i, n1, di)
    x_uj = _score(j, n2, dj)

    self.test_model = theano.function(
        inputs=[u, i, j, n1, n2, di, dj], outputs=x_ui - x_uj)
def test_blocksparse_inplace_gemv_opt():
    """Check the ``inplace`` flag of the last op in the compiled graph.

    The op must be in-place in every mode except FAST_COMPILE.
    """
    b, W, h = tensor.fmatrix(), tensor.ftensor4(), tensor.ftensor3()
    iIdx, oIdx = tensor.lmatrix(), tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    compiled = theano.function([W, h, iIdx, b, oIdx], out)

    last_op = compiled.maker.fgraph.toposort()[-1].op
    expect_inplace = theano.config.mode != "FAST_COMPILE"
    assert bool(last_op.inplace) == expect_inplace
def getAlignment(self):
    """Iterate over the data stream and print the tiled source input.

    For every item of ``self.data_stream`` the first element is passed
    through ``self._oov_to_unk`` and tiled ``config['beam_size']`` times
    along a new first axis, then printed.

    NOTE(review): the visible body never writes to or closes the two output
    files and never uses ``target_sentence`` or ``sampling_representation``;
    the function may be unfinished -- confirm.
    """
    unk_idx = self.config['unk_id']
    source_sentence = tensor.lmatrix('source')
    target_sentence = tensor.lmatrix('target')
    # NOTE(review): hard-coded absolute paths; the handles are never closed
    # in the visible code.
    ftrans = open('/Users/lqy/Documents/transout.txt','w',0)
    # NOTE(review): the third positional argument of gzip.open is
    # compresslevel, so 0 here presumably disables compression rather than
    # buffering -- confirm intent.
    falign = gzip.open('/Users/lqy/Documents/alignmentout','w',0)
    # NOTE(review): `encoder` is not defined in this method; presumably a
    # module-level name -- verify.
    sampling_representation = encoder.apply(source_sentence, tensor.ones(source_sentence.shape))
    for i, line in enumerate(self.data_stream.get_epoch_iterator()):
        seq = self._oov_to_unk(line[0], self.config['src_vocab_size'], unk_idx)
        # NOTE(review): uses bare `config` while the lines above use
        # `self.config` -- confirm which one is intended.
        input_ = numpy.tile(seq, (config['beam_size'], 1))
        print "input_: ",input_
def test_lookup_table():
    """Exercise ``LookupTable``: lookup values, ``get_dim`` and dim properties."""
    floatX = theano.config.floatX

    lt = LookupTable(5, 3)
    lt.allocate()
    # Row r of the table holds the values 3r, 3r + 1, 3r + 2.
    table = numpy.arange(15).reshape(5, 3).astype(floatX)
    lt.W.set_value(table)

    x = tensor.lmatrix("x")
    lookup = theano.function([x], [lt.apply(x)])

    x_val = [[1, 2], [0, 3]]
    desired = numpy.array(
        [[[3, 4, 5], [6, 7, 8]],
         [[0, 1, 2], [9, 10, 11]]],
        dtype=floatX)
    assert_equal(lookup(x_val)[0], desired)

    # Test get_dim
    first_input = lt.apply.inputs[0]
    first_output = lt.apply.outputs[0]
    assert_equal(lt.get_dim(first_input), 0)
    assert_equal(lt.get_dim(first_output), lt.dim)
    assert_raises(ValueError, lt.get_dim, 'random_name')

    # Test feedforward interface
    assert lt.input_dim == 0
    assert lt.output_dim == 3
    lt.output_dim = 4
    assert lt.output_dim == 4

    # Assigning a non-zero input_dim must be rejected ...
    assert_raises(ValueError, setattr, lt, 'input_dim', 11)
    # ... while assigning zero is accepted.
    lt.input_dim = 0
def test_ctc_targets():
    """Compare ``ctc_cost.get_targets`` against a finite-difference gradient.

    The analytic gradient ``-target / Y_hat`` must agree with
    ``finite_diff`` on the first five flattened entries to 3 decimals.
    """
    LENGTH = 20
    BATCHES = 4
    CLASSES = 2

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_target = ctc_cost.get_targets(y, T.log(y_hat), y_mask, y_hat_mask)

    # Same class distribution at every step, except time step 3.
    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:] = (.7, .2, .1)
    Y_hat[3] = (.3, .4, .3)
    Y = np.zeros((2, BATCHES), dtype='int64')
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # Per the original note the default blank symbol is the highest class
    # index; with CLASSES + 1 = 3 outputs that is index 2.
    Y_mask = np.ones(Y.shape, dtype=floatX)

    feed = {y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask, y_mask: Y_mask}
    target = ctc_target.eval(feed)

    # Same form as the cross-entropy gradient.
    grad = -target / Y_hat
    test_grad = finite_diff(Y, Y_hat, Y_mask, Y_hat_mask,
                            eps=1e-2, n_steps=5)
    testing.assert_almost_equal(grad.flatten()[:5],
                                test_grad.flatten()[:5],
                                decimal=3)
def test10(): src = T.ltensor3("src") tgt = T.lmatrix("tgt") mask = T.matrix("mask") prd = T.matrix("prd") n_hids, vocab_size = 3, 60 hs = HierarchicalSoftmax(src, n_hids, vocab_size) #prd = hs.test() res = hs.cost(tgt, mask) x = [ [[1,1,1],[2,2,2],[3,3,3],[4,4,4]], [[3,3,3],[4,4,4],[5,5,5],[6,6,6]] ] y = [ [1,1,1,1], [1,1,1,1] ] m = [ [1,1,0,0], [1,1,0,0] ] fn3 = theano.function(inputs=[src,tgt,mask], outputs=[res], on_unused_input='ignore') res = fn3(x,y,m) print res , res[0].shape x_a = np.array(x) print x_a.shape, x_a[y]
def test_ctc_log_path_probabs():
    """Check that masks produce ``-inf`` entries in ``ctc_cost._log_path_probabs``.

    Evaluates the forward log-probabilities twice: once with two masked
    time steps for batch item 0, and once more after also masking the
    last label row.
    """
    LENGTH = 10
    BATCHES = 3
    CLASSES = 2
    N_LABELS = 3

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    blanked_y, blanked_y_mask = ctc_cost._add_blanks(
        y=y, blank_symbol=1, y_mask=y_mask)
    p = ctc_cost._log_path_probabs(
        blanked_y, y_hat, blanked_y_mask, y_hat_mask, 1)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:] = (.7, .2, .1)
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-2:, 0] = 0
    Y_mask = np.ones(Y.shape, dtype=floatX)

    def _evaluate():
        # Reads the current (possibly mutated) mask arrays.
        return p.eval({y_hat: Y_hat, y: Y,
                       y_hat_mask: Y_hat_mask, y_mask: Y_mask})

    forward_probs = _evaluate()
    assert forward_probs[-2, 0, 0] == -np.inf

    Y_mask[-1] = 0
    forward_probs_y_mask = _evaluate()
    assert forward_probs_y_mask[-1, 1, -2] == -np.inf
    assert not np.isnan(forward_probs).any()
def __init__(self, R, k, E, U, EU, embedding_size):
    """Create the per-network parameters and the symbolic input.

    ``W`` (tensor), ``V``, ``b`` and ``u`` are drawn uniformly at random
    and wrapped in Theano shared variables; ``E``, ``U`` and ``EU`` are
    stored as given (noted in the original as shared among networks).

    Args:
        R: identifier appended to the names of ``V``, ``b`` and ``u``.
        k: number of slices.
        E, U, EU: externally created parameters stored on the instance.
        embedding_size: size used for the shapes of ``W`` and ``V``.
    """
    self.k = k  # Slices count
    self.R = R
    self.embedding_size = embedding_size

    init_range = 0.07
    init_range_W = 0.001

    def _uniform(limit, shape):
        return np.random.uniform(low=-limit, high=limit, size=shape)

    def _shared(values, name):
        return theano.shared(
            np.asarray(values, dtype=theano.config.floatX), name=name)

    # Random draws happen in the order W, V, b, u.
    tensor_init = _uniform(
        init_range_W, (self.embedding_size, self.embedding_size, k))
    neural_init = _uniform(init_range, (2 * self.embedding_size, k))
    bias_init = _uniform(init_range, (k,))
    concat_init = _uniform(init_range, (k, ))

    self.embedding_size_t = theano.shared(self.embedding_size)
    self.W = _shared(tensor_init, "W")
    # Shared among networks
    self.E = E
    self.U = U
    self.EU = EU

    suffix = str(R)
    shared_v = _shared(neural_init, "V" + suffix)
    shared_b = _shared(bias_init, "b" + suffix)
    shared_u = _shared(concat_init, "u" + suffix)
    self.V, self.b, self.u = shared_v, shared_b, shared_u

    self.params = [self.W, self.U, self.V, self.b, self.u]
    self.input = T.lmatrix()
    self.inputs = [self.input]  # For trainer
def __init__(self, dim, initializer=default_initializer, normalize=True,
             dropout=0, activation="tanh", verbose=True):
    """Build the negative-phrase branch on top of the parent encoder.

    All arguments are forwarded unchanged to the parent constructor. A
    scan over ``self.neg_seq`` applies ``self.encode`` starting from
    ``self.neg_vectors``. When ``neg_vectors`` has a single row, the
    output is that row and the reconstruction loss is 0.
    """
    super(NegativePhraseRAE, self).__init__(
        dim,
        initializer=initializer,
        normalize=normalize,
        dropout=dropout,
        activation=activation,
        verbose=verbose)

    self.neg_seq = T.lmatrix()
    self.neg_vectors = T.fmatrix()

    scan_outputs, _ = theano.scan(
        self.encode,
        sequences=[self.neg_seq],
        outputs_info=[self.neg_vectors, None],
        name="neg_rae_build")
    self.neg_scan_result = scan_outputs

    # First scan output: the history of vectors over all scan steps.
    self.neg_history_output = self.neg_scan_result[0]
    self.neg_all_output = self.neg_history_output[-1]

    # Special case: the negative phrase consists of a single vector.
    is_single = T.eq(self.neg_vectors.shape[0], 1)
    self.neg_output = ifelse(is_single,
                             self.neg_vectors[0],
                             self.neg_all_output[-1])
    self.neg_loss_rec = ifelse(is_single,
                               0.0,
                               T.sum(self.neg_scan_result[1]))
def test_ctc_pseudo_cost():
    """Smoke-test ``ctc_cost.pseudo_cost``: the summed cost must be positive.

    The gradient graph w.r.t. ``y_hat`` is also built, but it is not
    evaluated or asserted on.
    """
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:] = (.75, .2, .05)
    Y_hat[3, 0] = (.3, .4, .3)
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # Per the original note the default blank symbol is the highest class
    # index; with CLASSES + 1 = 3 outputs that is index 2.
    Y_mask = np.ones(Y.shape, dtype=floatX)
    Y_mask[30:] = 0

    feed = {y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask, y_mask: Y_mask}
    cost = pseudo_cost.eval(feed)

    # Building the gradient must at least not raise.
    pseudo_grad = T.grad(
        ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask).sum(), y_hat)
    # TODO: evaluate pseudo_grad and write some more meaningful asserts here
    assert cost.sum() > 0
def create_layers(self, X_dim, y_dim, random_state):
    """Assemble the network: embedding -> dense -> MaxpoolLayer -> sigmoid unit.

    Args:
        X_dim: indexable; ``X_dim[0]`` and ``X_dim[1]`` are the widths of
            the two inputs, ``X_dim[2]`` the embedding input size.
        y_dim: unused.
        random_state: passed to both initializers.

    Returns:
        ``((bounds_input, X_input), output_layer)``.
    """
    layers = lasagne.layers
    weight_init = kitchen.init.GlorotUniform(random_state=random_state,
                                             gain='relu')
    bias_init = kitchen.init.Uniform(random_state=random_state)

    bounds_in = layers.InputLayer(shape=(None, X_dim[0]),
                                  input_var=T.lmatrix('bounds'))
    tokens_in = layers.InputLayer(shape=(None, X_dim[1]),
                                  input_var=T.lmatrix('X'))

    embedded = layers.EmbeddingLayer(tokens_in,
                                     input_size=X_dim[2],
                                     output_size=40,
                                     W=weight_init)
    hidden = layers.DenseLayer(embedded,
                               num_units=40,
                               nonlinearity=lasagne.nonlinearities.rectify,
                               W=weight_init,
                               b=bias_init)
    pooled = MaxpoolLayer([bounds_in, hidden])
    output = layers.DenseLayer(pooled,
                               num_units=1,
                               nonlinearity=lasagne.nonlinearities.sigmoid,
                               W=weight_init,
                               b=bias_init)
    return (bounds_in, tokens_in), output
def test_ctc_symmetry_logscale():
    """Check that ``ctc_cost.cost`` is the same for a label sequence and its reverse.

    The per-step class distribution is constant over a long input
    (LENGTH = 5000). Both costs must also be finite and not NaN.
    """
    LENGTH = 5000
    BATCHES = 3
    CLASSES = 4

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:] = (.3, .2, .4, .1)
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (3 in this case)
    label_seq = np.array([0, 1, 2, 1, 2, 0, 2, 2, 2])
    Y = np.repeat(label_seq[:, None], BATCHES, axis=1)
    # the masks for this test should be all ones.
    Y_mask = np.ones(Y.shape, dtype=floatX)

    def _cost_for(labels):
        return ctc_cost_t.eval({y_hat: Y_hat, y: labels,
                                y_hat_mask: Y_hat_mask, y_mask: Y_mask})

    forward_cost = _cost_for(Y)
    backward_cost = _cost_for(Y[::-1])

    testing.assert_almost_equal(forward_cost[0], backward_cost[0])
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
def test_ctc_add_blanks():
    """Check the labels and mask produced by ``ctc_cost._add_blanks``.

    With three all-zero labels and blank symbol 1, the expected output has
    seven rows alternating blank / label. The mask of batch item 0, whose
    last label is masked out, is zero on the final two rows.
    """
    BATCHES = 3
    N_LABELS = 3

    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    blanked_y, blanked_y_mask = ctc_cost._add_blanks(
        y=y, blank_symbol=1, y_mask=y_mask)

    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y_mask = np.ones(Y.shape, dtype=floatX)
    Y_mask[-1, 0] = 0

    Blanked_y_mask = blanked_y_mask.eval({y_mask: Y_mask})
    Blanked_y = blanked_y.eval({y: Y})

    # Rows 0, 2, 4, 6 hold the blank symbol (1); rows 1, 3, 5 the labels (0).
    expected_labels = np.ones((7, 3), dtype='int32')
    expected_labels[1::2] = 0
    assert (Blanked_y == expected_labels).all()

    expected_mask = np.ones((7, 3), dtype=floatX)
    expected_mask[5:, 0] = 0.
    assert (Blanked_y_mask == expected_mask).all()
def test_ctc_pseudo_cost_skip_softmax_stability():
    """Compare ``pseudo_cost`` gradients with and without ``skip_softmax``.

    The gradient w.r.t. raw scores with ``skip_softmax=True`` must match,
    to 4 decimals, the gradient obtained by applying an explicit softmax
    and calling ``pseudo_cost`` with ``skip_softmax=False``.
    """
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask,
                                       skip_softmax=True)

    raw_scores = np.random.normal(0, 1, (LENGTH, BATCHES, CLASSES + 1))
    Y_hat = np.asarray(raw_scores, dtype=floatX)
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # Per the original note the default blank symbol is the highest class
    # index; with CLASSES + 1 = 3 outputs that is index 2.
    Y_mask = np.ones(Y.shape, dtype=floatX)
    Y_mask[30:] = 0

    feed = {y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask, y_mask: Y_mask}

    pseudo_grad = T.grad(pseudo_cost.sum(), y_hat)
    test_grad = pseudo_grad.eval(feed)

    # Reference path: explicit softmax over the class axis.
    exp_scores = T.exp(y_hat)
    y_hat_softmax = exp_scores / T.exp(y_hat).sum(2)[:, :, None]
    pseudo_cost2 = ctc_cost.pseudo_cost(y, y_hat_softmax, y_mask,
                                        y_hat_mask, skip_softmax=False)
    pseudo_grad2 = T.grad(pseudo_cost2.sum(), y_hat)
    test_grad2 = pseudo_grad2.eval(feed)

    testing.assert_almost_equal(test_grad, test_grad2, decimal=4)
def GRU_question(self, dimension_fact_embedding, num_hidden_units_questions,
                 num_hidden_units_episodes, max_question_len,
                 dimension_word_embeddings):
    """Encode the question with an unrolled GRU and return its vector.

    Creates the symbolic inputs ``self.question_idxs`` and
    ``self.question_mask``, looks the indices up in ``self.emb`` and runs
    ``max_question_len`` recurrence steps starting from
    ``self.h0_questions``.

    Args:
        dimension_fact_embedding: unused.
        num_hidden_units_questions: width of each gate slice in the
            stacked weight matrices.
        num_hidden_units_episodes: unused.
        max_question_len: number of unrolled recurrence steps.
        dimension_word_embeddings: second dimension of the reshaped
            embedding lookup.

    Returns:
        ``tanh(state . W_question_to_vector + b_question_to_vector)`` for
        the final state.
    """
    # Original note: as many columns as words in the context window and as
    # many lines as words in the sentence.
    self.question_idxs = T.lmatrix("question_indices")
    self.question_mask = T.lvector("question_mask")

    # Embeddings of the words in the question, one row per index row.
    q = self.emb[self.question_idxs].reshape(
        (self.question_idxs.shape[0], dimension_word_embeddings))

    # The stacked matrices are the same at every step, so build them once
    # instead of once per unrolled step. Column order: reset, update,
    # hidden.
    W_in_stacked = T.concatenate([self.W_question_reset_gate_x,
                                  self.W_question_update_gate_x,
                                  self.W_question_hidden_gate_x], axis=1)
    W_hid_stacked = T.concatenate([self.W_question_reset_gate_h,
                                   self.W_question_update_gate_h,
                                   self.W_question_hidden_gate_h], axis=1)

    def slice_w(x, n):
        # n-th gate-sized slice of a stacked pre-activation.
        return x[n * num_hidden_units_questions:
                 (n + 1) * num_hidden_units_questions]

    def question_gru_recursion(x_cur, h_prev, q_mask):
        input_n = T.dot(x_cur, W_in_stacked)
        hid_input = T.dot(h_prev, W_hid_stacked)

        # NOTE(review): both gates use tanh; the conventional GRU uses a
        # sigmoid here. Left unchanged -- confirm whether intentional.
        resetgate = T.tanh(slice_w(hid_input, 0) + slice_w(input_n, 0))
        updategate = T.tanh(slice_w(hid_input, 1) + slice_w(input_n, 1))

        hidden_update = T.tanh(
            slice_w(input_n, 2) + resetgate * slice_w(hid_input, 2))

        # Interpolate between the previous state and the candidate. The
        # earlier form used `hidden_update` in both terms, which reduces
        # to `hidden_update` and leaves the update gate without effect.
        h_cur = (1 - updategate) * h_prev + updategate * hidden_update
        # Masked positions keep the previous state.
        return q_mask * h_cur + (1 - q_mask) * h_prev

    state = self.h0_questions
    for jdx in range(max_question_len):
        state = question_gru_recursion(
            q[jdx], state, self.question_mask[jdx])

    return T.tanh(T.dot(state, self.W_question_to_vector)
                  + self.b_question_to_vector)
def get_char_emb_function(self):
    """Compile the function returning character-estimated OOV embeddings.

    The returned Theano function takes
    ``[self.x0, self.mask0, oov_char, rnn_mask]`` and returns
    ``[gembedding, self.inputs[0]]``. As a side effect this resets
    ``self.inputs`` and ``self.rev_mask``.

    The original docstring describes this as used at test time, with OOVs
    replaced by a context estimation.
    NOTE(review): ``is_train`` is fixed to 1 via ``givens`` even though
    this is described as a test-time function -- confirm this is intended.
    """
    oov_char = tensor.lmatrix('oov_char_pred')
    rnn_mask = tensor.matrix('rnn_mask_pred', dtype=floatX)

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = self.embedding_layer.connect(self.x)
    self.rev_mask = self.mask[::-1]

    embedding_matrix = self.embedding_layer.embeddings[0]

    # Shape notes below are the original author's annotations.
    # (oov_num, 2*char_hidden_dim)
    char_states = self.gemb.char_rnn.connect(oov_char, rnn_mask)
    char_preact = char_states.dimshuffle((0, 'x', 1))
    # (oov_num, batch=1, num_words)
    feat = self.gemb.mlp.connect(char_preact)

    # (oov_num*batch, num_words)
    flat_feat = feat.reshape([feat.shape[0] * feat.shape[1], feat.shape[2]])
    probs = tensor.nnet.softmax(flat_feat)

    # (oov_num*batch, num_words, emb_dim)
    weighted = probs.dimshuffle(0, 1, 'x') * embedding_matrix
    # Probability-weighted sum of embeddings; original author marked the
    # resulting shape (oov_num, batch, emb_dim) as uncertain ("???").
    gembedding = weighted.sum(axis=1).reshape(
        [feat.shape[0], feat.shape[1], -1])

    return theano.function(
        [self.x0, self.mask0, oov_char, rnn_mask],
        [gembedding, self.inputs[0]],
        name='char_gemb_pred',
        on_unused_input='warn',
        givens=({self.is_train: numpy.cast['int8'](1)}))
def __theano_init__(self):
    """Build the network and compile ``train``, ``compute_loss`` and ``weights``.

    The network is input -> embedding -> two dense layers -> linear output
    over the vocabulary. Its output is scored with an ``NCE`` loss object
    and the parameters are updated with Lasagne's ``adadelta``.
    """
    layers = L.layers

    # Symbolic inputs: X feeds the network; Y and N go to the loss.
    X = T.lmatrix('X')
    Y = T.lvector('Y')
    N = T.lvector('N')

    # Network structure
    l_in = layers.InputLayer(shape=(self.batch_size, self.n_gram),
                             input_var=X)
    l_we = layers.EmbeddingLayer(l_in, self.vocab_size, self.word_dim,
                                 W=self.D)
    l_f1 = layers.DenseLayer(l_we, self.hidden_dim1, W=self.C, b=self.Cb)
    l_f2 = layers.DenseLayer(l_f1, self.hidden_dim2, W=self.M, b=self.Mb)
    l_out = layers.DenseLayer(l_f2, self.vocab_size, W=self.E, b=self.Eb,
                              nonlinearity=None)

    # Symbolic network output; original note: (batch_size, vocab_size)
    net_output = layers.get_output(l_out)

    loss_builder = NCE(self.batch_size, self.vocab_size,
                       self.noise_dist, self.noise_sample_size)
    loss = loss_builder.evaluate(net_output, Y, N)

    # Trainable parameters and their adadelta updates
    trainable = layers.get_all_params(l_out, trainable=True)
    updates = L.updates.adadelta(loss, trainable)

    # Compiled functions for training and for computing the loss only
    fn_inputs = [l_in.input_var, Y, N]
    self.train = theano.function(fn_inputs, loss, updates=updates,
                                 allow_input_downcast=True)
    self.compute_loss = theano.function([l_in.input_var, Y, N], loss,
                                        allow_input_downcast=True)
    self.weights = theano.function(
        inputs=[],
        outputs=[self.D, self.C, self.M, self.E,
                 self.Cb, self.Mb, self.Eb])
def get_char_gemb_loss_function(self):
    """Compile the training function for the character-based embedding model.

    The returned Theano function takes
    ``[self.x0, self.mask0, oov_pos, oov_char, rnn_mask]``, returns the
    cross-entropy loss and applies adadelta updates (with gradient
    clipping) to ``self.gemb.params``.
    """
    oov_pos = tensor.lvector('oov_pos')
    oov_char = tensor.lmatrix('oov_char')
    rnn_mask = tensor.matrix('rnn_mask', dtype=floatX)

    # Index pairs (oov_pos[k], k) select one entry of self.x per position.
    first_axis_idx = oov_pos.flatten()
    second_axis_idx = tensor.arange(first_axis_idx.shape[0])

    emb_mat = self.embedding_layer.embeddings[0]  # not used below

    # Shape notes are the original author's annotations.
    # (batch, 2*char_hidden_dim)
    char_states = self.gemb.char_rnn.connect(oov_char, rnn_mask)
    # (batch, num_words)
    feat = self.gemb.mlp.connect(char_states)
    # (oov_num*batch, num_words) oov_num=1 fixed
    log_probs = tensor.log(tensor.nnet.softmax(feat))

    targets = self.x[first_axis_idx, second_axis_idx, 0].reshape([-1, 1])
    loss = CrossEntropyLoss().connect(inputs=log_probs,
                                      weights=None,
                                      labels=targets)

    raw_grads = tensor.grad(loss, self.gemb.params)
    grads = gradient_clipping(raw_grads, self.max_grad_norm)
    updates = adadelta(self.gemb.params, grads)

    return theano.function(
        [self.x0, self.mask0, oov_pos, oov_char, rnn_mask],
        loss,
        name='f_char_gemb_loss',
        updates=updates,
        on_unused_input='warn',
        givens=({self.is_train: numpy.cast['int8'](1)}))
def get_sampling_model_and_input(exp_config):
    """Build the encoder/decoder pair and the model over the generation graph.

    Args:
        exp_config: mapping providing ``src_vocab_size``, ``enc_embed``,
            ``enc_nhids``, ``trg_vocab_size``, ``dec_embed`` and
            ``dec_nhids``.

    Returns:
        ``(sampling_model, sampling_input, encoder, decoder)``.
    """
    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])
    # The decoder is given twice the encoder hidden size.
    decoder = Decoder(exp_config['trg_vocab_size'],
                      exp_config['dec_embed'],
                      exp_config['dec_nhids'],
                      exp_config['enc_nhids'] * 2,
                      loss_function='min_risk')

    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    logger.info("Building sampling model")
    # All-ones mask: every source position is treated as valid.
    full_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, full_mask)
    generated = decoder.generate(sampling_input, sampling_representation)

    # Model wrapper from which a Theano function can be obtained later.
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    return sampling_model, sampling_input, encoder, decoder
def __init__(self, data, config, fast_predict=False):
    """Create the embedding, stacked RNN and softmax layers and wire the graph.

    Args:
        data: provides ``embedding_shapes``, ``embeddings``, ``word_dict``,
            ``label_dict`` and ``unk_id``.
        config: provides ``lstm_cell``, ``lstm_hidden_size``,
            ``num_lstm_layers``, ``max_grad_norm`` and the dropout
            settings.
        fast_predict: forwarded to every RNN layer.
    """
    self.embedding_shapes = data.embedding_shapes
    self.lstm_type = config.lstm_cell
    self.lstm_hidden_size = int(config.lstm_hidden_size)
    self.num_lstm_layers = int(config.num_lstm_layers)
    self.max_grad_norm = float(config.max_grad_norm)
    self.vocab_size = data.word_dict.size()
    self.label_space_size = data.label_dict.size()
    self.unk_id = data.unk_id

    # ---- Layers and parameters ----
    self.embedding_layer = EmbeddingLayer(data.embedding_shapes,
                                          data.embeddings)
    self.params = list(self.embedding_layer.params)

    self.rnn_layers = [None] * self.num_lstm_layers
    for layer_idx in range(self.num_lstm_layers):
        is_first = layer_idx == 0
        # Only the first layer reads the embeddings directly.
        if is_first:
            input_dim = self.embedding_layer.output_size
        else:
            input_dim = self.lstm_hidden_size
        # Input dropout on the first layer, or on every layer when
        # per-layer dropout is enabled.
        if config.per_layer_dropout or is_first:
            input_dropout = config.input_dropout_prob
        else:
            input_dropout = 0.0
        recurrent_dropout = config.recurrent_dropout_prob

        layer_cls = get_rnn_layer(self.lstm_type)
        layer = layer_cls(input_dim,
                          self.lstm_hidden_size,
                          input_dropout_prob=input_dropout,
                          recurrent_dropout_prob=recurrent_dropout,
                          fast_predict=fast_predict,
                          prefix='lstm_{}'.format(layer_idx))
        self.rnn_layers[layer_idx] = layer
        self.params.extend(self.rnn_layers[layer_idx].params)

    self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size,
                                      self.label_space_size)
    self.params.extend(self.softmax_layer.params)

    # ---- Graph ----
    # Original note: shape of x is [seq_len, batch_size, num_features].
    self.x0 = tensor.ltensor3('x')
    self.y0 = tensor.lmatrix('y')
    self.mask0 = tensor.matrix('mask', dtype=floatX)
    self.is_train = tensor.bscalar('is_train')

    # The first two axes of every input are swapped before use.
    self.x = self.x0.dimshuffle(1, 0, 2)
    self.y = self.y0.dimshuffle(1, 0)
    self.mask = self.mask0.dimshuffle(1, 0)

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = self.embedding_layer.connect(self.x)
    self.rev_mask = self.mask[::-1]

    # Each layer's output is reversed along the first axis before it is
    # fed to the next layer; odd layers therefore use the reversed mask.
    for depth, rnn in enumerate(self.rnn_layers):
        if depth % 2 == 0:
            layer_mask = self.mask
        else:
            layer_mask = self.rev_mask
        outputs = rnn.connect(self.inputs[depth], layer_mask, self.is_train)
        self.inputs[depth + 1] = outputs[::-1]

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    pred_grid = self.pred.reshape([self.x.shape[0], self.x.shape[1]])
    self.pred0 = pred_grid.dimshuffle(1, 0)
def testrun(params, datasets):
    """Evaluate generalization with the learned parameters and plot the costs.

    Args:
        params: indexable; items 0-3 are ``w1, w2, b1, b2`` and item 4 is
            the sequence of costs to plot.
        datasets: pair ``(test_set_x, test_set_t)``.
    """
    w1, w2, b1, b2, costs = (params[k] for k in range(5))
    test_set_x, test_set_t = datasets

    # Forward pass: two ReLU layers followed by a softmax.
    x = T.dmatrix('x')
    hidden1 = T.nnet.relu(T.dot(x, w1) + b1)
    hidden2 = T.nnet.relu(T.dot(hidden1, w2) + b2)
    y = T.nnet.softmax(hidden2)
    t = T.lmatrix('t')  # created but unused

    predict = theano.function(inputs=[x], outputs=y)
    print("------------------------")
    print("TEST MODEL IS READY!!")

    accuracy = calc_accuracy(test_set_t, predict(test_set_x))
    print("*********")
    print("OUR ACCURACY :", accuracy*100, " PER CENTO !!" )
    print("*********")

    # Plot the loss function
    plt.plot(costs, '-')
    plt.show()
def main(save_to, num_epochs):
    """Train a 784-100-10 MLP on MNIST and checkpoint it.

    Args:
        save_to: destination passed to ``Checkpoint``.
        num_epochs: number of epochs after which training stops.
    """
    def _flat_stream(dataset, batch_size):
        # Sequential minibatches with the 'features' source flattened.
        scheme = SequentialScheme(dataset.num_examples, batch_size)
        stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
        return Flatten(stream, which_sources=('features', ))

    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    # Add an L2 penalty on both weight matrices.
    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_weight = .00005
    cost = cost + l2_weight * (W1 ** 2).sum() + l2_weight * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             _flat_stream(mnist_test, 500),
                             prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        Printing(),
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        plot_channels = [
            ['test_final_cost',
             'test_misclassificationrate_apply_error_rate'],
            ['train_total_gradient_norm'],
        ]
        extensions.append(Plot('MNIST example', channels=plot_channels))

    main_loop = MainLoop(algorithm,
                         _flat_stream(mnist_train, 50),
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def test_beam_search():
    """Test beam search using the model from the reverse_words demo.

    The model is only randomly initialized, not trained, so this does not
    check that the best output sequence is found. It only checks that the
    costs returned by the search match the costs recomputed by the model.
    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    reverser = WordReverser(10, alphabet_size)
    init = IsotropicGaussian(0.5)
    reverser.weights_init = init
    reverser.biases_init = init
    reverser.initialize()

    inputs = tensor.lmatrix('inputs')
    generation_graph = ComputationGraph(reverser.generate(inputs))
    output_filter = VariableFilter(bricks=[reverser.generator],
                                   name="outputs")
    (samples,) = output_filter(generation_graph)

    # One random sequence repeated across the beam, laid out as
    # (length, beam_size).
    sequence = rng.randint(alphabet_size, size=(length,))
    input_vals = numpy.tile(sequence, (beam_size, 1)).T

    search = BeamSearch(10, samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0, 3 * length)

    # Recompute the costs with the model and sum them over the first axis.
    input_mask = numpy.ones((length, beam_size), dtype=floatX)
    per_step_costs = reverser.cost(input_vals, input_mask,
                                   results, mask).eval()
    true_costs = (per_step_costs * mask).sum(axis=0)
    assert_allclose(costs, true_costs, rtol=1e-5)
def test_ctc_targets():
    """Compare ``ctc_cost.get_targets`` against a finite-difference gradient.

    The analytic gradient ``-target / Y_hat`` must agree with
    ``finite_diff`` on the first five flattened entries to 3 decimals.
    """
    LENGTH = 20
    BATCHES = 4
    CLASSES = 2

    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_target = ctc_cost.get_targets(y, T.log(y_hat), y_mask, y_hat_mask)

    # Same class distribution at every step, except time step 3.
    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:] = (.7, .2, .1)
    Y_hat[3] = (.3, .4, .3)
    Y = np.zeros((2, BATCHES), dtype='int64')
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # Per the original note the default blank symbol is the highest class
    # index; with CLASSES + 1 = 3 outputs that is index 2.
    Y_mask = np.ones(Y.shape, dtype=floatX)

    feed = {y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask, y_mask: Y_mask}
    target = ctc_target.eval(feed)

    # Same form as the cross-entropy gradient.
    grad = -target / Y_hat
    test_grad = finite_diff(Y, Y_hat, Y_mask, Y_hat_mask,
                            eps=1e-2, n_steps=5)
    testing.assert_almost_equal(grad.flatten()[:5],
                                test_grad.flatten()[:5],
                                decimal=3)
def test_minibatch_fn(self):
    """Compile and return a Theano function that evaluates a dataset.

    The returned function takes ``[X, L_x, Y, L_y]`` and returns the
    ``[precision, recall]`` pair produced by ``self.evaluate``.
    """
    x_var = T.lmatrix('X_test')
    x_len = T.lvector('L_X_test')
    y_var = T.lmatrix('Y_test')
    y_len = T.lvector('L_y_test')

    fn_inputs = [x_var, x_len, y_var, y_len]
    precision, recall = self.evaluate(*fn_inputs)

    return theano.function(inputs=fn_inputs,
                           outputs=[precision, recall])