def __init__(self, sound_shape, num_units, main_layer_class, loss_func, updates_func):
    # input tensor: (batch count, recording count, time, frequency)
    input_X = T.tensor4("X")

    # network
    input_layer = InputLayer(shape=(None, 3) + sound_shape, input_var=input_X.swapaxes(2, 3))
    all_output = main_layer_class(input_layer, sound_shape, num_units)  # for loss
    vector_output = ReshapeLayer(all_output, (-1, 1, num_units))  # for use

    # network predictions
    all_predicted = get_output(all_output)  # for loss
    vector_predicted = get_output(vector_output)  # for use

    # loss function
    loss = loss_func(all_predicted)

    # compute updated weights with one gradient step
    trainable_weights = get_all_params(all_output, trainable=True)
    updates_sgd = updates_func(loss, trainable_weights)

    # function that trains the network for one step and returns the loss value
    self.fit = theano.function([input_X], loss, updates=updates_sgd)
    # function that returns the voice vector
    self.predict = theano.function([input_X], vector_predicted)

    self.all_output = all_output
    self.vector_output = vector_output
    self.all_predicted = all_predicted
    self.vector_predicted = vector_predicted
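# Hypothetical usage sketch (not in the original source): how this wrapper
# might be constructed. `VoiceNet`, `build_main_layers` and `triplet_loss` are
# illustrative names; only the constructor signature above is real.
import numpy as np
import lasagne

sound_shape = (100, 40)  # (time, frequency), illustrative
net = VoiceNet(sound_shape, num_units=64,
               main_layer_class=build_main_layers,
               loss_func=triplet_loss,
               updates_func=lambda loss, params: lasagne.updates.sgd(loss, params, learning_rate=0.01))
# the input tensor is swapaxes(2, 3)-ed before the InputLayer, so raw batches
# are (batch, 3, frequency, time)
batch = np.zeros((8, 3) + sound_shape[::-1], dtype=np.float32)
print(net.fit(batch))  # one training step, returns the loss value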
def create_iter_funcs_valid(l_out, bs=None, N=50, mc_dropout=False):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    if not mc_dropout:
        y_hat = layers.get_output(l_out, X, deterministic=True)
    else:
        if bs is None:
            raise ValueError('a fixed batch size is required for mc dropout')
        X_repeat = T.extra_ops.repeat(X, N, axis=0)
        y_sample = layers.get_output(
            l_out, X_repeat, deterministic=False)

        sizes = [X_repeat.shape[0] / X.shape[0]] * bs
        y_sample_split = T.as_tensor_variable(
            T.split(y_sample, sizes, bs, axis=0))
        y_hat = T.mean(y_sample_split, axis=1)

    valid_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))
    valid_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    valid_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[valid_loss, valid_acc],
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return valid_iter
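# Hypothetical usage sketch (assumption, not from the source): with
# mc_dropout=True the compiled function averages N stochastic forward passes
# per example, so it must always be called with exactly `bs` examples;
# `l_out`, `X_valid` and `y_valid` are assumed to exist.
valid_iter = create_iter_funcs_valid(l_out, bs=32, N=50, mc_dropout=True)
valid_loss, valid_acc = valid_iter(X_valid[:32], y_valid[:32])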
def test_slice_layer():
    from lasagne.layers import SliceLayer, InputLayer, get_output_shape,\
        get_output
    from numpy.testing import assert_array_almost_equal as aeq
    in_shp = (3, 5, 2)
    l_inp = InputLayer(in_shp)
    l_slice_ax0 = SliceLayer(l_inp, axis=0, indices=0)
    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
    l_slice_ax2 = SliceLayer(l_inp, axis=-1, indices=-1)

    x = np.arange(np.prod(in_shp)).reshape(in_shp).astype('float32')
    x1 = x[0]
    x2 = x[:, 3:5]
    x3 = x[:, :, -1]

    assert get_output_shape(l_slice_ax0) == x1.shape
    assert get_output_shape(l_slice_ax1) == x2.shape
    assert get_output_shape(l_slice_ax2) == x3.shape

    aeq(get_output(l_slice_ax0, x).eval(), x1)
    aeq(get_output(l_slice_ax1, x).eval(), x2)
    aeq(get_output(l_slice_ax2, x).eval(), x3)

    # test slicing None dimension
    in_shp = (2, None, 2)
    l_inp = InputLayer(in_shp)
    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
    assert get_output_shape(l_slice_ax1) == (2, None, 2)
    aeq(get_output(l_slice_ax1, x).eval(), x2)
def init_model(self):
    print('Initializing model...')
    ra_input_var = T.tensor3('raw_audio_input')
    mc_input_var = T.tensor3('melody_contour_input')
    target_var = T.imatrix('targets')
    network = self.build_network(ra_input_var, mc_input_var)

    prediction = layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

    test_prediction = layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                      dtype=theano.config.floatX)

    print('Building functions...')
    self.train_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                    [loss, prediction],
                                    updates=updates,
                                    on_unused_input='ignore')
    self.val_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                  [test_loss, test_acc, test_prediction],
                                  on_unused_input='ignore')
    self.run_fn = theano.function([ra_input_var, mc_input_var],
                                  [prediction],
                                  on_unused_input='ignore')
def test():
    w, h, c = 8, 8, 1
    encoder, decoder = build_convnet_deep(w=w, h=h, c=c)
    encoder = layers_from_list_to_dict(encoder)
    decoder = layers_from_list_to_dict(decoder)
    print(encoder.keys(), decoder.keys())

    x = encoder["input"].input_var
    f = theano.function(
        [x],
        [layers.get_output(encoder["z_mean"], x),
         layers.get_output(encoder["z_log_sigma"], x)]
    )
    X = np.random.uniform(size=(1, c, w, h)).astype(np.float32)
    m, s = f(X)
    print(m.shape, s.shape)

    z = decoder["input"].input_var
    D = (decoder["input"].output_shape)[1]
    Z = np.random.uniform(size=(1, D)).astype(np.float32)
    f = theano.function([z], layers.get_output(decoder["output"], z))
    print(f(Z).shape)

    z = layers.get_output(encoder["z_mean"], x)
    f = theano.function(
        [x],
        layers.get_output(decoder["output"], {encoder["input"]: x, decoder["input"]: z}),
        givens={decoder["input"].input_var: z},
        on_unused_input='ignore')
    print(f(X).shape)
def __init__(self, dims, nonlinearities=None, dropouts=None,
             update_fn=None, batch_norm=False,
             loss_type='cosine_margin', margin=0.8):
    """Initialize a Siamese neural network

    Parameters:
    -----------
    update_fn: theano function with 2 arguments (loss, params)
        Update scheme, defaults to adadelta
    batch_norm: bool
        Do batch normalisation on the first layer, defaults to False
    """
    assert len(dims) >= 3, 'Not enough dimensions'
    if dropouts is not None:
        dropouts = copy.copy(dropouts)
        assert len(dropouts) == len(dims) - 1
        dropouts.append(0)
    else:
        dropouts = [0] * len(dims)
    if nonlinearities is None:
        nonlinearities = [nl.sigmoid] * (len(dims) - 1)
    else:
        assert len(nonlinearities) == len(dims) - 1
    if update_fn is None:
        update_fn = lasagne.updates.adadelta

    self.input_var1 = T.matrix('inputs1')
    self.input_var2 = T.matrix('inputs2')
    self.target_var = T.ivector('targets')

    # input layer
    network1 = layers.InputLayer((None, dims[0]), input_var=self.input_var1)
    network2 = layers.InputLayer((None, dims[0]), input_var=self.input_var2)
    if dropouts[0]:
        network1 = layers.DropoutLayer(network1, p=dropouts[0])
        network2 = layers.DropoutLayer(network2, p=dropouts[0])

    # hidden layers
    for dim, dropout, nonlin in zip(dims[1:], dropouts[1:], nonlinearities):
        network1 = layers.DenseLayer(network1, num_units=dim,
                                     W=lasagne.init.GlorotUniform(),
                                     nonlinearity=nonlin)
        network2 = layers.DenseLayer(network2, num_units=dim,
                                     W=network1.W, b=network1.b,
                                     nonlinearity=nonlin)
        if batch_norm:
            network1 = layers.batch_norm(network1)
            network2 = layers.batch_norm(network2)
        if dropout:
            network1 = layers.DropoutLayer(network1, p=dropout)
            network2 = layers.DropoutLayer(network2, p=dropout)

    self.network = [network1, network2]
    self.params = layers.get_all_params(network1, trainable=True)

    # util functions, completely stolen from Lasagne example
    self.prediction1 = layers.get_output(network1)
    self.prediction2 = layers.get_output(network2)
    # deterministic (test-time) predictions:
    self.test_prediction1 = layers.get_output(network1, deterministic=True)
    self.test_prediction2 = layers.get_output(network2, deterministic=True)

    self.change_loss(loss_type, margin)
    self.change_update(update_fn)
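# Hypothetical usage sketch (illustrative): a three-layer Siamese net over
# 40-dimensional inputs; `SiameseNet` stands in for whatever class this
# __init__ belongs to.
from lasagne import nonlinearities as nl

siamese = SiameseNet(dims=[40, 100, 50],
                     nonlinearities=[nl.rectify, nl.linear],
                     dropouts=[0.2, 0.5],
                     loss_type='cosine_margin', margin=0.8)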
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # raising a bare string is invalid Python; raise a proper exception
        raise ValueError("Optimizer was not set correctly")

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    # input1_var, input1_mask_var, input2_var and input2_mask_var are assumed
    # to be defined at module scope
    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
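# Hypothetical usage sketch (assumption): `args` only needs `.optimizer`,
# `.step` and `.task` here, so a plain namespace is enough; `network`,
# `penalty`, `input_dict` and `target_var` are assumed to exist as above.
from argparse import Namespace

args = Namespace(optimizer="adam", step=0.001, task="ent")
train_fn, val_fn = generate_theano_func(args, network, penalty, input_dict, target_var)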
def __init__(self):
    self.input_X = theano.tensor.tensor4("X")
    # self.X_reshaped = self.input_X.dimshuffle([0, 3, 1, 2])
    self.target_y = theano.tensor.vector("target Y", dtype='int32')

    # Architecture
    in_0 = Input(shape=[None, 1, 64, 64], input_var=self.input_X)
    in_downsample = Pool(in_0, [4, 4])
    conv_0 = Conv(in_downsample, 64, (2, 2), nonlinearity=sigmoid)
    pool_0 = Pool(conv_0, (3, 3))
    self.out = Dense(pool_0, num_units=7, nonlinearity=softmax)

    # load the last state (if the file exists)
    self.path = "{}/{}.npz".format(os.getcwd(), self.__class__.__name__)
    if os.path.exists(self.path):
        with np.load(self.path) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            lasagne.layers.set_all_param_values(self.out, param_values)

    self.predict_net = theano.compile.function([self.input_X], get_output(self.out))

    # For training (can be removed)
    self.all_weights = lasagne.layers.get_all_params(self.out)
    self.y_predicted = get_output(self.out)
    self.loss = lasagne.objectives.categorical_crossentropy(self.y_predicted, self.target_y).mean()
    self.accuracy = lasagne.objectives.categorical_accuracy(self.y_predicted, self.target_y).mean()
    self.updates = lasagne.updates.adadelta(self.loss, self.all_weights, learning_rate=0.01)
    self.train_fun = theano.function([self.input_X, self.target_y],
                                     [self.loss, self.accuracy],
                                     updates=self.updates,
                                     allow_input_downcast=True)
    self.accuracy_fun = theano.function([self.input_X, self.target_y],
                                        self.accuracy,
                                        allow_input_downcast=True)
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer = InputLayer(shape=(None, 12, 64, 64), input_var=input_var)
    layer = DimshuffleLayer(layer, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batch normalisation then activation layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = Conv3DDNNLayer(incoming=layer, num_filters=1, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=sigmoid)
    layer_prediction = layer

    # Loss
    prediction = get_output(layer_prediction)
    loss = binary_crossentropy(prediction[:, 0, :, :, :], target_var).mean()

    # Updates: Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True)
    test_loss = binary_crossentropy(test_prediction[:, 0, :, :, :], target_var).mean()

    return test_prediction, prediction, loss, params
def dist_info_sym(self, obs_var, state_info_vars):
    n_batches, n_steps = obs_var.shape[:2]
    obs_var = obs_var.reshape((n_batches, n_steps, -1))
    if self.state_include_action:
        prev_action_var = state_info_vars["prev_action"]
        all_input_var = TT.concatenate(
            [obs_var, prev_action_var],
            axis=2
        )
    else:
        all_input_var = obs_var
    if self.feature_network is None:
        return dict(
            prob=L.get_output(
                self.prob_network.output_layer,
                {self.l_input: all_input_var}
            )
        )
    else:
        flat_input_var = TT.reshape(all_input_var, (-1, self.input_dim))
        return dict(
            prob=L.get_output(
                self.prob_network.output_layer,
                {self.l_input: all_input_var,
                 self.feature_network.input_layer: flat_input_var}
            )
        )
def test_reapply():
    l_in1 = InputLayer([None, 10], T.zeros([5, 10]))
    l_d1 = DenseLayer(l_in1, 20)
    l_d2 = DenseLayer(l_in1, 30)
    l_cat = ConcatLayer([l_d1, l_d2])
    l_d3 = DenseLayer(l_cat, 20)

    l_in2 = InputLayer([None, 10], T.zeros([5, 10]))

    # reapply the whole network to a new input
    new_l_d3 = reapply(l_d3, {l_in1: l_in2})
    get_output(new_l_d3).eval()

    l_in3 = InputLayer([None, 30], T.zeros([5, 30]))
    new_l_d3 = reapply(l_d3, {l_d2: l_in3})

    # multiple inputs
    new_l_cat = reapply(l_cat, {l_d2: ConcatLayer([l_d1, l_d2]), l_in1: l_in2})
    get_output(new_l_cat).eval()

    # multiple layers
    l1, l2 = reapply([l_d3, l_d2], {l_in1: l_in2})
    outs = reapply({'d3': l_d3, 'd2': l_d2}, {l_in1: l_in2})
    assert isinstance(outs, dict)
    outs['d3'], outs['d2']
def __init__(self, network_description):
    signal.signal(signal.SIGINT, self.signal_handler)
    self.name = network_description['name']
    netbuilder = NetworkBuilder(network_description)
    self.shouldStopNow = False
    # Get the lasagne network using the network builder class, which creates
    # an autoencoder with the specified architecture
    self.network = netbuilder.buildNetwork()
    self.encode_layer, self.encode_size = netbuilder.getEncodeLayerAndSize()
    self.t_input, self.t_target = netbuilder.getInputAndTargetVars()
    self.input_type = netbuilder.getInputType()
    self.batch_size = netbuilder.getBatchSize()
    rootLogger.info("Network: " + self.networkToStr())
    # Reconstruction is just the output of the network
    recon_prediction_expression = layers.get_output(self.network)
    # Latent/encoded space is the output of the bottleneck/encode layer
    encode_prediction_expression = layers.get_output(self.encode_layer, deterministic=True)
    # Loss for the autoencoder = reconstruction loss + weight decay regularizer
    loss = self.getReconstructionLossExpression(recon_prediction_expression, self.t_target)
    weightsl2 = lasagne.regularization.regularize_network_params(self.network,
                                                                 lasagne.regularization.l2)
    loss += (5e-5 * weightsl2)
    params = lasagne.layers.get_all_params(self.network, trainable=True)
    # SGD with momentum + decaying learning rate
    self.learning_rate = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.nesterov_momentum(loss, params,
                                                learning_rate=self.learning_rate)
    # Theano functions for calculating loss, predicting reconstruction, encoding
    self.trainAutoencoder = theano.function([self.t_input, self.t_target], loss,
                                            updates=updates)
    self.predictReconstruction = theano.function([self.t_input],
                                                 recon_prediction_expression)
    self.predictEncoding = theano.function([self.t_input],
                                           encode_prediction_expression)
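# Hypothetical training-loop sketch (not from the source): `Autoencoder`
# stands in for the class this __init__ belongs to, and `iterate_minibatches`
# is assumed to yield (input, target) pairs.
net = Autoencoder(network_description)
for epoch in range(10):
    epoch_loss, n_batches = 0.0, 0
    for inputs, targets in iterate_minibatches(data, net.batch_size):
        epoch_loss += net.trainAutoencoder(inputs, targets)
        n_batches += 1
    rootLogger.info("epoch %d: loss %.4f" % (epoch + 1, epoch_loss / n_batches))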
def get_model(input_images, input_position, input_mult, target_var):

    # number of SAX and distance between SAX slices
    #indexes = []
    #for i in range(input_position.shape[0]):
    #    indexes.append(numpy.where(input_position[i][:,0] == 0.)[0][0])

    # input layer with unspecified batch size
    layer = InputLayer(shape=(None, 22, 30, 64, 64), input_var=input_images)

    # Z-score?

    # Convolution then batch normalisation then activation, with residual shortcuts
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])

    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = Conv3DDNNLayer(incoming=layer, num_filters=22, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=sigmoid)

    layer_max = ExpressionLayer(layer, lambda X: X.max(1), output_shape='auto')
    layer_min = ExpressionLayer(layer, lambda X: X.min(1), output_shape='auto')

    layer_prediction = layer

    # image prediction
    prediction = get_output(layer_prediction)

    loss = binary_crossentropy(prediction, target_var).mean()

    # Updates: Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True)
    test_loss = binary_crossentropy(test_prediction, target_var).mean()

    return test_prediction, prediction, loss, params
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer((None, seq_len, input_dim),
                      input_var=theano.shared(np.random.normal(size=[batch_size, seq_len, input_dim])),
                      name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({s_lstm_cell: s_prev_cell,
                                                  s_lstm_hid: s_prev_hid,
                                                  s_gru: s_prev_gru}),
                     input_sequences={s_in: l_in},
                     unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print(old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print(old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
def _create_iter_funcs(self, layers, objective, update, output_type):
    y_batch = output_type('y_batch')

    output_layer = list(layers.values())[-1]
    objective_params = self._get_params_for('objective')
    obj = objective(output_layer, **objective_params)
    if not hasattr(obj, 'layers'):
        # XXX breaking the Lasagne interface a little:
        obj.layers = layers

    loss_train = obj.get_loss(None, y_batch)
    loss_eval = obj.get_loss(None, y_batch, deterministic=True)
    predict_proba = get_output(output_layer, None, deterministic=True)

    try:
        transform = get_output([v for k, v in layers.items()
                                if 'rmspool' in k or 'maxpool' in k][-1],
                               None, deterministic=True)
    except IndexError:
        # wrapped in list() for consistency with the lookup above
        transform = get_output(list(layers.values())[-2], None, deterministic=True)

    if not self.regression:
        predict = predict_proba.argmax(axis=1)
        accuracy = T.mean(T.eq(predict, y_batch))
    else:
        accuracy = loss_eval

    all_params = self.get_all_params(trainable=True)
    update_params = self._get_params_for('update')
    updates = update(loss_train, all_params, **update_params)

    input_layers = [layer for layer in layers.values()
                    if isinstance(layer, InputLayer)]

    X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name)
                for input_layer in input_layers]
    inputs = X_inputs + [theano.Param(y_batch, name="y")]

    train_iter = theano.function(
        inputs=inputs,
        outputs=[loss_train],
        updates=updates,
        )
    eval_iter = theano.function(
        inputs=inputs,
        outputs=[loss_eval, accuracy],
        )
    predict_iter = theano.function(
        inputs=X_inputs,
        outputs=predict_proba,
        )
    transform_iter = theano.function(
        inputs=X_inputs,
        outputs=transform,
        )

    return train_iter, eval_iter, predict_iter, transform_iter
def doClusteringWithKMeansLoss(self, dataset, epochs):
    '''
    Trains the autoencoder with combined k-means loss and reconstruction loss.
    At the moment does not give good results
    :param dataset: Data on which the autoencoder is trained
    :param epochs: Number of training epochs
    :return: None - (side effect) saves the trained network params and latent space in the appropriate location
    '''
    batch_size = self.batch_size
    # Load the inputs in latent space produced by the pretrained autoencoder
    # and use it to initialize cluster centers
    Z = np.load('saved_params/%s/z_%s.npy' % (dataset.name, self.name))
    quality_desc, cluster_centers = evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), 'Initial')
    rootLogger.info(quality_desc)
    # Load network parameters - code borrowed from mnist lasagne example
    with np.load('saved_params/%s/m_%s.npz' % (dataset.name, self.name)) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(self.network, param_values, trainable=True)
    # reconstruction loss is just the rms loss between the input and the reconstructed input
    reconstruction_loss = self.getReconstructionLossExpression(layers.get_output(self.network), self.t_target)
    # extend the network to do soft cluster assignments
    clustering_network = ClusteringLayer(self.encode_layer, dataset.getClusterCount(),
                                         cluster_centers, batch_size, self.encode_size)
    soft_assignments = layers.get_output(clustering_network)
    # k-means loss is the sum of distances from the cluster centers weighted by
    # the soft assignments to the clusters
    kmeansLoss = self.getKMeansLoss(layers.get_output(self.encode_layer), soft_assignments,
                                    clustering_network.W, dataset.getClusterCount(),
                                    self.encode_size, batch_size)
    params = lasagne.layers.get_all_params(self.network, trainable=True)
    # total loss = reconstruction loss + lambda * kmeans loss
    weight_reconstruction = 1
    weight_kmeans = 0.1
    total_loss = weight_kmeans * kmeansLoss + weight_reconstruction * reconstruction_loss
    updates = lasagne.updates.nesterov_momentum(total_loss, params, learning_rate=0.01)
    trainKMeansWithAE = theano.function([self.t_input, self.t_target], total_loss, updates=updates)
    for epoch in range(epochs):
        error = 0
        total_batches = 0
        for batch in dataset.iterate_minibatches(self.input_type, batch_size, shuffle=True):
            inputs, targets = batch
            error += trainKMeansWithAE(inputs, targets)
            total_batches += 1
        # For every 10th epoch, update the cluster centers and print the clustering
        # accuracy and nmi - for checking if the network is actually doing something
        # meaningful - the labels are never used for training
        if (epoch + 1) % 10 == 0:
            for i, batch in enumerate(dataset.iterate_minibatches(self.input_type, batch_size, shuffle=False)):
                Z[i * batch_size:(i + 1) * batch_size] = self.predictEncoding(batch[0])
            quality_desc, cluster_centers = evaluateKMeans(
                Z, dataset.labels, dataset.getClusterCount(),
                "%d/%d [%.4f]" % (epoch + 1, epochs, error / total_batches))
            rootLogger.info(quality_desc)
        else:
            # Just print the training loss
            rootLogger.info("%-30s %8s %8s" % ("%d/%d [%.4f]" % (epoch + 1, epochs, error / total_batches), "", ""))
        if self.shouldStopNow:
            break
    # Save the inputs in latent space and the network parameters
    for i, batch in enumerate(dataset.iterate_minibatches(self.input_type, batch_size, shuffle=False)):
        Z[i * batch_size:(i + 1) * batch_size] = self.predictEncoding(batch[0])
    np.save('saved_params/%s/pc_km_z_%s.npy' % (dataset.name, self.name), Z)
    np.savez('saved_params/%s/pc_km_m_%s.npz' % (dataset.name, self.name),
             *lasagne.layers.get_all_param_values(self.network, trainable=True))
def build_treatment_model(self, n_vars, **kwargs):

    input_vars = TT.matrix()
    instrument_vars = TT.matrix()
    targets = TT.vector()

    inputs = layers.InputLayer((None, n_vars), input_vars)
    inputs = layers.DropoutLayer(inputs, p=0.2)

    dense_layer = layers.DenseLayer(inputs, 2 * kwargs['dense_size'],
                                    nonlinearity=nonlinearities.rectify)
    dense_layer = layers.batch_norm(dense_layer)
    dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

    for _ in xrange(kwargs['n_dense_layers'] - 1):
        dense_layer = layers.DenseLayer(dense_layer, kwargs['dense_size'],
                                        nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)

    self.treatment_output = layers.DenseLayer(dense_layer, 1,
                                              nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.treatment_output)

    prediction = layers.get_output(self.treatment_output, deterministic=False)
    test_prediction = layers.get_output(self.treatment_output, deterministic=True)

    l2_cost = regularization.regularize_network_params(self.treatment_output,
                                                       regularization.l2)
    loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

    params = layers.get_all_params(self.treatment_output, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._train_fn = theano.function(
        [
            input_vars,
            targets,
            instrument_vars,
        ],
        loss,
        updates=param_updates
    )

    self._loss_fn = theano.function(
        [
            input_vars,
            targets,
            instrument_vars,
        ],
        loss,
    )

    self._output_fn = theano.function(
        [
            input_vars,
        ],
        test_prediction,
    )

    return init_params
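# Hypothetical usage sketch (illustrative): the GMM objective takes features,
# targets and instruments with matching first dimensions; `model` stands in
# for the object this method belongs to, and the instrument width is made up.
import numpy as np
import theano

n, n_vars = 256, 10
init_params = model.build_treatment_model(n_vars, dense_size=64, n_dense_layers=3)
X = np.random.randn(n, n_vars).astype(theano.config.floatX)
y = np.random.randn(n).astype(theano.config.floatX)
Z = np.random.randn(n, 4).astype(theano.config.floatX)
loss = model._train_fn(X, y, Z)  # one adadelta step on the GMM loss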
def _create_iter_funcs(self, layers, objective, update, output_type):
    y_batch = output_type('y_batch')

    output_layer = layers[-1]
    objective_kw = self._get_params_for('objective')

    loss_train = objective(
        layers, target=y_batch, **objective_kw)
    loss_eval = objective(
        layers, target=y_batch, deterministic=True, **objective_kw)
    predict_proba = get_output(output_layer, None, deterministic=True)

    if not self.regression:
        predict = predict_proba.argmax(axis=1)
        accuracy = T.mean(T.eq(predict, y_batch))
    else:
        accuracy = loss_eval

    all_params = self.get_all_params(trainable=True)
    update_params = self._get_params_for('update')
    updates = update(loss_train, all_params, **update_params)

    input_layers = [layer for layer in layers.values()
                    if isinstance(layer, InputLayer)]

    X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name)
                for input_layer in input_layers]
    inputs = X_inputs + [theano.Param(y_batch, name="y")]

    train_iter = theano.function(
        inputs=inputs,
        outputs=[loss_train],
        updates=updates,
        allow_input_downcast=True,
        )
    eval_iter = theano.function(
        inputs=inputs,
        outputs=[loss_eval, accuracy],
        allow_input_downcast=True,
        )
    predict_iter = theano.function(
        inputs=X_inputs,
        outputs=predict_proba,
        allow_input_downcast=True,
        )

    # Ido addition:
    h_predict = get_output(layers[self.hiddenLayer_to_output], None, deterministic=True)
    output_last_hidden_layer_ = theano.function(
        inputs=X_inputs,
        outputs=h_predict,
        allow_input_downcast=True,
        )

    return train_iter, eval_iter, predict_iter, output_last_hidden_layer_
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_input = InputLayer(shape=(None, 30, 80, 80), input_var=input_var)
    layer_0 = DimshuffleLayer(layer_input, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batch normalisation then activation, twice, then max pooling followed by a dropout layer
    layer_1 = batch_norm(Conv3DDNNLayer(incoming=layer_0, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_2 = batch_norm(Conv3DDNNLayer(incoming=layer_1, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_3 = MaxPool3DDNNLayer(layer_2, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_4 = DropoutLayer(layer_3, p=0.25)

    # Convolution then batch normalisation then activation, twice, then max pooling followed by a dropout layer
    layer_5 = batch_norm(Conv3DDNNLayer(incoming=layer_4, num_filters=32, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_6 = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=32, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_7 = MaxPool3DDNNLayer(layer_6, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_8 = DropoutLayer(layer_7, p=0.25)

    # Convolution then batch normalisation then activation, three times, then max pooling followed by a dropout layer
    # (note: these rebind the layer_5..layer_8 names from the block above)
    layer_5 = batch_norm(Conv3DDNNLayer(incoming=layer_8, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_6 = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_7 = batch_norm(Conv3DDNNLayer(incoming=layer_6, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_8 = MaxPool3DDNNLayer(layer_7, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_9 = DropoutLayer(layer_8, p=0.25)

    # LSTM
    layer = DimshuffleLayer(layer_9, (0, 2, 1, 3, 4))
    # layer_prediction = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True, cell=Gate(linear))
    layer = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True)
    layer_prediction = DenseLayer(layer, 2, nonlinearity=linear)

    # Output Layer
    # layer_hidden = DenseLayer(layer_flatten, 500, nonlinearity=linear)
    # layer_prediction = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction = get_output(layer_prediction) / multiply_var**2
    loss = T.abs_(prediction - target_var)
    loss = loss.mean()

    # Updates: Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True) / multiply_var**2
    test_loss = T.abs_(test_prediction - target_var)
    test_loss = test_loss.mean()

    # crps estimate
    crps = T.abs_(test_prediction - target_var).mean() / 600

    return test_prediction, crps, loss, params
def _compile(self):
    rc = self.rc

    # actor gradient step
    O = self.net.O
    V = ll.get_output(self.net.critic)
    params = self.net.actor_params
    regl_params = ll.get_all_params(self.net.actor, regularizable=True)
    regl = 0.5*rc['l2_actor']*tt.sum([tt.sum(p**2) for p in regl_params])
    updates = rc['gradient_updates'](V.mean()+regl, params, learning_rate=rc['lr_actor'])
    self.update_actor = th.function([O], [V.mean()], updates=updates)

    # critic bellman error (test version, doesn't update parameters)
    U = tt.matrix()
    Q = ll.get_output(self.net.critic, inputs={self.net.actor: U})
    Y = tt.matrix()
    J = 0.5*tt.mean((Y-Q)**2)
    self.J = th.function([O, U, Y], J)

    # critic bellman error (train version, does update parameters)
    regl_params = [p for p in ll.get_all_params(self.net.critic, regularizable=True)
                   if p not in ll.get_all_params(self.net.actor)]
    regl = 0.5*rc['l2_critic']*tt.sum([tt.sum(p**2) for p in regl_params])
    params = self.net.critic_params
    updates = rc['gradient_updates'](J+regl, params, learning_rate=rc['lr_critic'])
    self.update_critic = th.function([O, U, Y], J, updates=updates)

    # target network update
    updates = []
    tau = rc['tau']
    for p, tgt_p in zip(self.net.all_params, self.target_net.all_params):
        updates.append((tgt_p, tau*p + (1-tau)*tgt_p))
    self.update_target = th.function([], [], updates=updates)

    # build cost function
    # TODO: handle this better through rc
    x = tt.vector()
    u = tt.vector()
    site_xpos = tt.matrix()

    # L2 costs
    c = 0.5*rc['l2_q']*tt.sum(x[:self.model['nq']]**2)
    c += 0.5*rc['l2_v']*tt.sum(x[-self.model['nv']:]**2)
    c += 0.5*rc['l2_u']*tt.sum(u**2)

    # Huber costs
    if rc['huber_site'] is not None:
        a = rc['huber_alpha']
        d = site_xpos[0] - site_xpos[1]
        c += rc['huber_site']*(tt.sqrt(tt.sum(d**2) + a**2) - a)

    # compile cost function
    # TODO: remove need for 'on_unused_input'
    self.cost = th.function([x, u, site_xpos], c, on_unused_input='ignore')
def triplet_loss_iter(embedder, update_params={}):
    X_triplets = {
        'anchor': T.tensor4(),
        'positive': T.tensor4(),
        'negative': T.tensor4(),
    }  # each will be a batch of images

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made):
    # the last prediction layer is a) the end of the network and b) what we ultimately care about,
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1] for k, X in X_triplets.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X, deterministic=True)
                              for k, X in X_triplets.items()}

    # each output should be batch_size x embed_size

    # should give us a vector of batch_size of distances btw anchor and positive
    alpha = 0.2  # FaceNet alpha
    triplet_pos = lambda pred: (pred['anchor'] - pred['positive']).norm(2, axis=1)
    triplet_neg = lambda pred: (pred['anchor'] - pred['negative']).norm(2, axis=1)
    triplet_distances = lambda pred: (triplet_pos(pred) - triplet_neg(pred) + alpha).clip(0, np.inf)
    triplet_failed = lambda pred: T.mean(triplet_distances(pred) > alpha)
    triplet_loss = lambda pred: T.sum(triplet_distances(pred))

    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: triplet_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'TL'  # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']],
                                 [loss_train] + grads,
                                 updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png', var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']],
                                 [triplet_loss(predicted_embeds_valid),
                                  losses_reg(predicted_embeds_valid),
                                  triplet_failed(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
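# Hypothetical usage sketch (illustrative): one training step on a batch of
# (anchor, positive, negative) image triplets; `embedder` and the three
# batches are assumed to exist.
iters = triplet_loss_iter(embedder)
out = iters['train'](anchor_batch, positive_batch, negative_batch)
loss, grads = out[0], out[1:]  # grads line up with iters['gradnames']
val_loss, val_loss_reg, frac_failed = iters['valid'](anchor_batch, positive_batch, negative_batch)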
def contrastive_loss_iter(embedder, update_params={}):
    X_pairs = {
        'img1': T.tensor4(),
        'img2': T.tensor4(),
    }
    y = T.ivector()  # basically class labels

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made):
    # the last prediction layer is a) the end of the network and b) what we ultimately care about,
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1] for k, X in X_pairs.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X, deterministic=True)
                              for k, X in X_pairs.items()}

    margin = 1

    # if distance is 0 that's bad
    distance = lambda pred: (pred['img1'] - pred['img2'] + 1e-7).norm(2, axis=1)
    contrastive_loss = lambda pred: T.mean(y*(distance(pred)) + (1 - y)*(margin - distance(pred)).clip(0, np.inf))
    failed_matches = lambda pred: T.switch(T.eq(T.sum(y), 0), 0, T.sum((y*distance(pred)) > margin) / T.sum(y))
    # count non-match pairs that ended up closer than the margin
    # (the original read "(1-y*distance(pred)) < margin", which mixes up the
    # operator precedence and counts matched pairs as well)
    failed_nonmatches = lambda pred: T.switch(T.eq(T.sum(1-y), 0), 0, T.sum((1-y)*(distance(pred) < margin)) / T.sum(1-y))
    failed_pairs = lambda pred: 0.5*failed_matches(pred) + 0.5*failed_nonmatches(pred)

    decay = 0.0001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: contrastive_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL'  # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y],
                                 [loss_train] + grads,
                                 updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png', var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y],
                                 [contrastive_loss(predicted_embeds_valid),
                                  losses_reg(predicted_embeds_valid),
                                  failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
def create_generator_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # no need to pass an input to l_prior_in here
    generator_outputs = get_output(
        layers['l_encoder_out'], X, deterministic=False)

    # so pass the output of the generator as the output of the concat layer
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_encoder_concat']: generator_outputs,
        },
        deterministic=False
    )

    # the discriminator learns to predict 1 for q(z|x),
    # so the generator should fool it into predicting 0
    generator_targets = T.zeros_like(X_batch.shape[0])

    # so the generator needs to push the discriminator's output to 0
    generator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            generator_targets,
        )
    )

    if apply_updates:
        # only layers that are part of the generator (i.e., encoder)
        # should be updated
        generator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, generator=True)

        generator_updates = nesterov_momentum(
            generator_loss, generator_params, 0.1, 0.0)
    else:
        generator_updates = None

    generator_func = theano.function(
        inputs=[
            theano.In(X_batch),
        ],
        outputs=generator_loss,
        updates=generator_updates,
        givens={
            X: X_batch,
        },
    )

    return generator_func
def loss_iter(segmenter, update_params={}):
    X = T.tensor4()
    y = T.tensor4()
    pixel_weights = T.tensor3()

    final_pred_layer = segmenter[-1]
    all_layers = ll.get_all_layers(segmenter)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made):
    # the last prediction layer is a) the end of the network and b) what we ultimately care about,
    # however the other prediction layers will be incorporated into the training loss
    predicted_masks_train = ll.get_output(segmenter, X)
    predicted_mask_valid = ll.get_output(final_pred_layer, X, deterministic=True)

    thresh = 0.5
    accuracy = lambda pred: T.mean(T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)))
    true_pos = lambda pred: T.sum((pred[:, 0, :, :] > thresh) * (y[:, 0, :, :] > thresh))
    false_pos = lambda pred: T.sum((pred[:, 0, :, :] > thresh) - (y[:, 0, :, :] > thresh))
    precision = lambda pred: (true_pos(pred) / (true_pos(pred) + false_pos(pred)))

    pixel_weights_1d = pixel_weights.flatten(ndim=1)
    losses = lambda pred: T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7) * pixel_weights_1d)

    decay = 0.0001
    reg = regularize_network_params(final_pred_layer, l2) * decay
    losses_reg = lambda pred: losses(pred) + reg
    loss_train = T.sum([losses_reg(mask) for mask in predicted_masks_train])
    loss_train.name = 'CE'  # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in segmenter]))
    all_params = ll.get_all_params(segmenter, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    acc_train = accuracy(predicted_masks_train[-1])
    acc_valid = accuracy(predicted_mask_valid)
    prec_train = precision(predicted_masks_train[-1])
    prec_valid = precision(predicted_mask_valid)

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X, y, pixel_weights], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png', var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X, y, pixel_weights],
                                 [losses(predicted_mask_valid),
                                  losses_reg(predicted_mask_valid),
                                  prec_valid])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
def get_network(model):
    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')

    network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data)

    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh

    prev_layer = network
    for l in range(model['nlayers']):
        fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc

    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)

    predictions = layers.get_output(output_layer)

    if model['output_mode'] == OUTPUT_BOUNDED:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')

    params = layers.get_all_params(output_layer)

    test_prediction = layers.get_output(output_layer, deterministic=True)
    test_loss = objectives.squared_error(test_prediction, targets_var)
    test_loss = test_loss.mean()

    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)

    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    val_fn = theano.function([input_data, targets_var], test_loss)

    return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer}
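# Hypothetical usage sketch (assumption, not from the source): the `model`
# dict keys read inside get_network, filled with illustrative values;
# OUTPUT_BOUNDED is the module-level constant used above.
model = {
    'batch_size': 32, 'input_vars': 10, 'nlayers': 2, 'units': 64,
    'hidden_nonlinearity': 'ReLu', 'dropout': False,
    'output_mode': OUTPUT_BOUNDED, 'control': 'steer',
    'maxmin': {'steer': (-1.0, 1.0)}, 'lr': 0.01,
}
fns = get_network(model)
loss = fns['train'](x_batch, y_batch)  # x: (32, 10), y: (32, 1)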
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_both_0 = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)

    # Z-score?

    # Convolution then batch normalisation then activation layer, twice, then max pooling followed by a dropout layer
    layer_both_1 = batch_norm(Conv2DLayer(layer_both_0, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_2 = batch_norm(Conv2DLayer(layer_both_1, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_3 = MaxPool2DLayer(layer_both_2, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_4 = DropoutLayer(layer_both_3, p=0.25)

    # Convolution then batch normalisation then activation layer, twice, then max pooling followed by a dropout layer
    layer_both_5 = batch_norm(Conv2DLayer(layer_both_4, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_6 = batch_norm(Conv2DLayer(layer_both_5, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_7 = MaxPool2DLayer(layer_both_6, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_8 = DropoutLayer(layer_both_7, p=0.25)

    # Convolution then batch normalisation then activation layer, three times, then max pooling followed by a dropout layer
    layer_both_9 = batch_norm(Conv2DLayer(layer_both_8, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_10 = batch_norm(Conv2DLayer(layer_both_9, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_11 = batch_norm(Conv2DLayer(layer_both_10, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_12 = MaxPool2DLayer(layer_both_11, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_13 = DropoutLayer(layer_both_12, p=0.25)

    # Flatten
    layer_flatten = FlattenLayer(layer_both_13)

    # Prediction
    layer_hidden = DenseLayer(layer_flatten, 500, nonlinearity=sigmoid)
    layer_prediction = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction = get_output(layer_prediction) / multiply_var
    loss = squared_error(prediction, target_var)
    loss = loss.mean()

    # Updates: Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True) / multiply_var
    test_loss = squared_error(test_prediction, target_var)
    test_loss = test_loss.mean()

    # crps estimate
    crps = T.abs_(test_prediction - target_var).mean() / 600

    return test_prediction, crps, loss, params
def get_train_loss(self, target_vars, params):
    assert len(target_vars) == 1
    prediction = get_output(self.l_out)
    mean_loss = self.loss(prediction, target_vars[0]).mean()
    monitored = [('loss', mean_loss)]
    grads = T.grad(mean_loss, params)
    if self.options.monitor_grads:
        for p, grad in zip(params, grads):
            monitored.append(('grad/' + p.name, grad))
    if self.options.monitor_activations:
        for name, layer in get_named_layers(self.l_out).iteritems():
            monitored.append(('activation/' + name, get_output(layer)))
    return OrderedDict(monitored), grads, []
def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005,
                learning_rate_rl=0.005, batch_size=32, ment=0.1):
    # 2-layer MLP
    self.in_size = in_size  # x and y coordinate
    self.out_size = out_size  # up, down, right, left
    self.batch_size = batch_size
    self.learning_rate = learning_rate_rl
    self.n_hid = n_hid

    input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \
        T.itensor3('am'), T.fvector('r')
    in_var = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1], self.in_size))

    l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)

    pol_in = T.fmatrix('pol-h')
    l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var)
    l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in, mask_input=l_mask_in)  # B x H x D
    pol_out = L.get_output(l_pol_rnn)[:, -1, :]
    l_den_in = L.ReshapeLayer(l_pol_rnn, (turn_mask.shape[0]*turn_mask.shape[1], n_hid))  # BH x D
    l_out = L.DenseLayer(l_den_in, self.out_size, nonlinearity=lasagne.nonlinearities.softmax)

    self.network = l_out
    self.params = L.get_all_params(self.network)

    # rl
    probs = L.get_output(self.network)  # BH x A
    out_probs = T.reshape(probs, (input_var.shape[0], input_var.shape[1], self.out_size))  # B x H x A
    log_probs = T.log(out_probs)
    act_probs = (log_probs*act_mask).sum(axis=2)  # B x H
    ep_probs = (act_probs*turn_mask).sum(axis=1)  # B
    H_probs = -T.sum(T.sum(out_probs*log_probs, axis=2), axis=1)  # B
    self.loss = 0. - T.mean(ep_probs*reward_var + ment*H_probs)

    updates = lasagne.updates.rmsprop(self.loss, self.params,
                                      learning_rate=learning_rate_rl, epsilon=1e-4)

    self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
    self.train_fn = theano.function(self.inps, self.loss, updates=updates)
    self.obj_fn = theano.function(self.inps, self.loss)
    self.act_fn = theano.function([input_var, turn_mask, pol_in], [out_probs, pol_out])

    # sl
    sl_loss = 0. - T.mean(ep_probs)
    sl_updates = lasagne.updates.rmsprop(sl_loss, self.params,
                                         learning_rate=learning_rate_sl, epsilon=1e-4)

    self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss,
                                       updates=sl_updates)
    self.sl_obj_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss)
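# Hypothetical usage sketch (illustrative): one act step with a zero initial
# GRU state; `agent` stands in for the class this method belongs to.
import numpy as np

pol_h0 = np.zeros((1, agent.n_hid), dtype='float32')
obs = np.zeros((1, 1, agent.in_size), dtype='float32')  # B x H x D
turns = np.ones((1, 1), dtype='int32')
act_probs, pol_h1 = agent.act_fn(obs, turns, pol_h0)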
def build_optimizer(network, placeholders, optimization, learning_rate):

    # build loss function
    if optimization['objective'] == 'lower_bound':
        if 'binary' in optimization:
            binary = optimization['binary']
        else:
            binary = False

        loss, prediction = variational_lower_bound(network, placeholders['inputs'],
                                                   deterministic=False, binary=binary)

        # regularize parameters
        loss += regularization(network['X'], optimization)

        params = layers.get_all_params(network['X'], trainable=True)

    else:
        prediction = layers.get_output(network['output'], deterministic=False)
        loss = build_loss(placeholders['targets'], prediction, optimization)

        # regularize parameters
        loss += regularization(network['output'], optimization)

        params = layers.get_all_params(network['output'], trainable=True)

    # calculate and clip gradients
    if "weight_norm" in optimization:
        weight_norm = optimization['weight_norm']
    else:
        weight_norm = None
    grad = calculate_gradient(loss, params, weight_norm=weight_norm)

    # setup parameter updates
    update_op = build_updates(grad, params, optimization, learning_rate)

    # test/validation set
    if optimization['objective'] == 'lower_bound':
        test_loss, test_prediction = variational_lower_bound(network, placeholders['inputs'],
                                                             deterministic=False, binary=binary)
    else:
        test_prediction = layers.get_output(network['output'], deterministic=True)
        test_loss = build_loss(placeholders['targets'], test_prediction, optimization)

    # create theano functions
    train_fun = theano.function(list(placeholders.values()), [loss, prediction],
                                updates=update_op)
    test_fun = theano.function(list(placeholders.values()), [test_loss, test_prediction])

    return train_fun, test_fun
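# Hypothetical usage sketch (assumption): `network` and `placeholders` are
# dicts as used above, and the argument order of the compiled functions
# follows placeholders.values(); the optimization keys shown are illustrative.
optimization = {'objective': 'categorical_crossentropy', 'optimizer': 'adam'}
train_fun, test_fun = build_optimizer(network, placeholders, optimization,
                                      learning_rate=0.001)
loss, prediction = train_fun(x_batch, y_batch)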
def __init__(self, conf):
    self.conf = conf
    if self.conf.act == "linear":
        self.conf.act = linear
    elif self.conf.act == "sigmoid":
        self.conf.act = sigmoid
    elif self.conf.act == "relu":
        self.conf.act = rectify
    elif self.conf.act == "tanh":
        self.conf.act = tanh
    else:
        raise ValueError("Unknown activation function", self.conf.act)

    input_var_first = T.matrix('inputs1')
    input_var_second = T.matrix('inputs2')
    target_var = T.matrix('targets')

    # create network
    self.autoencoder, encoder_first, encoder_second = self.__create_toplogy__(
        input_var_first, input_var_second)

    self.out = get_output(self.autoencoder)

    loss = squared_error(self.out, target_var)
    loss = loss.mean()

    params = get_all_params(self.autoencoder, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=self.conf.lr,
                                momentum=self.conf.momentum)

    # training function
    self.train_fn = theano.function([input_var_first, input_var_second, target_var],
                                    loss, updates=updates)

    # function to reconstruct
    test_reconstruction = get_output(self.autoencoder, deterministic=True)
    self.reconstruction_fn = theano.function([input_var_first, input_var_second],
                                             test_reconstruction)

    # encoding function
    test_encode = get_output([encoder_first, encoder_second], deterministic=True)
    self.encoding_fn = theano.function([input_var_first, input_var_second], test_encode)

    # utils
    blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
    self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
    self.blas_scal = blas('scal', np.array([], dtype=float))

    # load weights if necessary
    if self.conf.load_model is not None:
        self.load_model()
word_index, tokenizer = data_process.get_tokenizer(data, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)
train_x, train_y, train_le, train_labels, _, tr_ids = data_process.get_dev_data_with_id(
    train_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
dev_x, dev_y, dev_le, dev_labels, _, dev_ids = data_process.get_dev_data_with_id(
    dev_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
test_x, test_y, test_le, test_labels, _, test_ids = data_process.get_dev_data_with_id(
    test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
domain_test_x, domain_test_y, domain_test_le, domain_test_labels, _, domain_ids = data_process.get_dev_data_with_id(
    domain_test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)

x_sym = T.imatrix('inputs1')

l_in = lasagne.layers.InputLayer((None, MAX_SEQUENCE_LENGTH), x_sym)
model2 = build_convpool_max(l_in, emb_model, word_index, MAX_NB_WORDS,
                            EMBEDDING_DIM, MAX_SEQUENCE_LENGTH)
output = get_output(model2, x_sym)
# compile the feature extractor; `cnn` is used below, so this must not stay commented out
cnn = theano.function([x_sym], output)

train_x = cnn(train_x)
print(train_x.shape)
dev_x = cnn(dev_x)
print(dev_x.shape)
test_x = cnn(test_x)
print(test_x.shape)
R, C = train_x.shape

train_y = train_y.astype('int32')
dev_y = dev_y.astype('int32')

domain_test_x = cnn(domain_test_x)
print(train_x.shape)
print(dev_x.shape)
def ff(input_data, input_mask, network):
    predict_data = get_output(network, deterministic=True)
    predict_fn = theano.function(inputs=[input_data, input_mask],
                                 outputs=[predict_data])
    return predict_fn
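# Hypothetical usage sketch (not from the source): `input_data` and
# `input_mask` are the symbolic variables the network was built on, so the
# compiled function is called with their concrete numpy counterparts.
predict_fn = ff(input_data, input_mask, network)
(probs,) = predict_fn(batch_data, batch_mask)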
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    # important context words as channels

    # CNN_sentence config
    filter_size = wordDim
    pool_size = seqlen - filter_size + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable')  # (batchsize, seqlen, wordDim)
    #print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    #print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim,
                         stride=1, nonlinearity=tanh, W=GlorotUniform())
    # nOutputFrame = num_filters
    # nOutputFrameSize = (num_feats*wordDim - filter_size)/stride + 1
    #print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))
    #print get_output_shape(conv1d)

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)
    #print get_output_shape(maxpool)

    #forward = FlattenLayer(maxpool)
    #print get_output_shape(forward)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # raising a bare string is invalid Python; raise a proper exception
        raise ValueError("Optimizer was not set correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
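# Hypothetical usage sketch (illustrative): wordEmbeddings is read as
# (wordDim, vocab_size) above, and the symbolic variables are made up here.
from argparse import Namespace
import theano.tensor as T

input_var = T.itensor3('input')  # (batch, seqlen, num_feats) of word indices
target_var = T.imatrix('targets')  # one-hot, 2 classes
args = Namespace(hiddenDim=50, optimizer="adagrad", step=0.01)
train_fn, val_fn, network = event_span_classifier(
    args, input_var, target_var, wordEmbeddings, seqlen=30, num_feats=3)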
def log_likelihood_sym(self, x_var, y_var): normalized_xs_var = (x_var - self._x_mean_var) / self._x_std_var prob = L.get_output( self._l_prob, {self._prob_network.input_layer: normalized_xs_var}) return self._dist.log_likelihood_sym(TT.cast(y_var, 'int32'), dict(prob=prob))
# zca
whitener = ZCA(x=x_unlabelled)
sym_x_l_zca = whitener.apply(sym_x_l)
sym_x_eval_zca = whitener.apply(sym_x_eval)
sym_x_u_zca = whitener.apply(sym_x_u)
sym_x_u_rep_zca = whitener.apply(sym_x_u_rep)
sym_x_u_d_zca = whitener.apply(sym_x_u_d)

# init
lasagne.layers.get_output(classifier, sym_x_u_zca, init=True)
init_updates = [u for l in lasagne.layers.get_all_layers(classifier)
                for u in getattr(l, 'init_updates', [])]
init_fn = theano.function([sym_x_u], [], updates=init_updates)

# outputs
gen_out_x = ll.get_output(gen_layers[-1], {gen_in_y: sym_y_g, gen_in_z: sym_z_rand},
                          deterministic=False)
gen_out_x_zca = whitener.apply(gen_out_x)
cla_out_y_l = ll.get_output(classifier, sym_x_l_zca, deterministic=False)
cla_out_y_eval = ll.get_output(classifier, sym_x_eval_zca, deterministic=True)
cla_out_y = ll.get_output(classifier, sym_x_u_zca, deterministic=False)
cla_out_y_rep = ll.get_output(classifier, sym_x_u_rep_zca, deterministic=False)
bn_updates = [u for l in lasagne.layers.get_all_layers(classifier)
              for u in getattr(l, 'bn_updates', [])]
cla_out_y_d = ll.get_output(classifier, sym_x_u_d_zca, deterministic=False)
cla_out_y_d_hard = cla_out_y_d.argmax(axis=1)
cla_out_y_g = ll.get_output(classifier, gen_out_x_zca, deterministic=False)
dis_out_p = ll.get_output(dis_layers[-1],
                          {dis_in_x: T.concatenate([sym_x_l, sym_x_u_d], axis=0),
                           dis_in_y: T.concatenate([sym_y, cla_out_y_d_hard], axis=0)},
                          deterministic=False)
dis_out_p_g = ll.get_output(dis_layers[-1],
                            {dis_in_x: gen_out_x, dis_in_y: sym_y_g},
                            deterministic=False)

# argmax
cla_out_y_hard = cla_out_y.argmax(axis=1)
def __init__( self, env_spec, env, pkl_path=None, json_path=None, npz_path=None, trainable_snn=True, ##CF - latents units at the input latent_dim=3, # we keep all these as the dim of the output of the other MLP and others that we will need! latent_name='categorical', bilinear_integration=False, # again, needs to match! resample=False, # this can change: frequency of resampling the latent? hidden_sizes_snn=(32, 32), hidden_sizes_selector=(10, 10), external_latent=False, learn_std=True, init_std=1.0, adaptive_std=False, std_share_network=False, std_hidden_sizes=(32, 32), std_hidden_nonlinearity=NL.tanh, hidden_nonlinearity=NL.tanh, output_nonlinearity=None, min_std=1e-4, ): self.latent_dim = latent_dim ## could I avoid needing this self for the get_action? self.latent_name = latent_name self.bilinear_integration = bilinear_integration self.resample = resample self.min_std = min_std self.hidden_sizes_snn = hidden_sizes_snn self.hidden_sizes_selector = hidden_sizes_selector self.pre_fix_latent = np.array( [] ) # if this is not empty when using reset() it will use this latent self.latent_fix = np.array( []) # this will hold the latents variable sampled in reset() self.shared_latent_var = theano.shared( self.latent_fix) # this is for external lat! update that self._set_std_to_0 = False self.trainable_snn = trainable_snn self.external_latent = external_latent self.pkl_path = pkl_path self.json_path = json_path self.npz_path = npz_path self.old_policy = None if self.json_path: # there is another one after defining all the NN to warm-start the params of the SNN data = json.load( open(os.path.join(config.PROJECT_PATH, self.json_path), 'r')) # I should do this with the json file self.old_policy_json = data['json_args']["policy"] self.latent_dim = self.old_policy_json['latent_dim'] self.latent_name = self.old_policy_json['latent_name'] self.bilinear_integration = self.old_policy_json[ 'bilinear_integration'] self.resample = self.old_policy_json[ 'resample'] # this could not be needed... self.min_std = self.old_policy_json['min_std'] self.hidden_sizes_snn = self.old_policy_json['hidden_sizes'] elif self.pkl_path: data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path)) self.old_policy = data["policy"] self.latent_dim = self.old_policy.latent_dim self.latent_name = self.old_policy.latent_name self.bilinear_integration = self.old_policy.bilinear_integration self.resample = self.old_policy.resample # this could not be needed... self.min_std = self.old_policy.min_std self.hidden_sizes_snn = self.old_policy.hidden_sizes if self.latent_name == 'normal': self.latent_dist = DiagonalGaussian(self.latent_dim) self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim)) elif self.latent_name == 'bernoulli': self.latent_dist = Bernoulli(self.latent_dim) self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim)) elif self.latent_name == 'categorical': self.latent_dist = Categorical(self.latent_dim) if self.latent_dim > 0: self.latent_dist_info = dict(prob=1. 
/ self.latent_dim * np.ones(self.latent_dim)) else: self.latent_dist_info = dict(prob=np.ones( self.latent_dim)) # this is an empty array else: raise NotImplementedError Serializable.quick_init(self, locals()) assert isinstance(env_spec.action_space, Box) # retrieve dimensions and check consistency if isinstance(env, MazeEnv) or isinstance(env, GatherEnv): self.obs_robot_dim = env.robot_observation_space.flat_dim self.obs_maze_dim = env.maze_observation_space.flat_dim elif isinstance(env, NormalizedEnv): if isinstance(env.wrapped_env, MazeEnv) or isinstance( env.wrapped_env, GatherEnv): self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim else: self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim self.obs_maze_dim = 0 else: self.obs_robot_dim = env.observation_space.flat_dim self.obs_maze_dim = 0 # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim) all_obs_dim = env_spec.observation_space.flat_dim assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim if self.external_latent: # in case we want to fix the latent externally l_all_obs_var = L.InputLayer( shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, )) all_obs_var = l_all_obs_var.input_var # l_selection = ConstOutputLayer(incoming=l_all_obs_var, output_var=self.shared_latent_var) l_selection = ParamLayer(incoming=l_all_obs_var, num_units=self.latent_dim, param=self.shared_latent_var, trainable=False) selection_var = L.get_output(l_selection) else: # create network with softmax output: it will be the latent 'selector'! latent_selection_network = MLP( input_shape=(self.obs_robot_dim + self.obs_maze_dim, ), output_dim=self.latent_dim, hidden_sizes=self.hidden_sizes_selector, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=NL.softmax, ) l_all_obs_var = latent_selection_network.input_layer all_obs_var = latent_selection_network.input_layer.input_var # collect the output to select the behavior of the robot controller (equivalent to latents) l_selection = latent_selection_network.output_layer selection_var = L.get_output(l_selection) # split all_obs into the robot and the maze obs --> ROBOT goes first!! l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim) l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim, end_index=None) obs_robot_var = all_obs_var[:, :self.obs_robot_dim] obs_maze_var = all_obs_var[:, self.obs_robot_dim:] # Enlarge obs with the selectors (or latents). 
Here just computing the final input dim if self.bilinear_integration: l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection]) else: l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection]) action_dim = env_spec.action_space.flat_dim # create the action network mean_network = MLP( input_layer= l_obs_snn, # input the layer that handles the integration of the selector output_dim=action_dim, hidden_sizes=self.hidden_sizes_snn, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=output_nonlinearity, name="meanMLP", ) self._layers_mean = mean_network.layers l_mean = mean_network.output_layer if adaptive_std: log_std_network = MLP(input_layer=l_obs_snn, output_dim=action_dim, hidden_sizes=std_hidden_sizes, hidden_nonlinearity=std_hidden_nonlinearity, output_nonlinearity=None, name="log_stdMLP") l_log_std = log_std_network.output_layer self._layers_log_std = log_std_network.layers else: l_log_std = ParamLayer( incoming=mean_network.input_layer, num_units=action_dim, param=lasagne.init.Constant(np.log(init_std)), name="output_log_std", trainable=learn_std, ) self._layers_log_std = [l_log_std] self._layers_snn = self._layers_mean + self._layers_log_std # this returns a list with the "snn" layers if not self.trainable_snn: for layer in self._layers_snn: for param, tags in layer.params.items( ): # params of layer are OrDict: key=the shared var, val=tags tags.remove("trainable") if self.json_path and self.npz_path: warm_params_dict = dict( np.load(os.path.join(config.PROJECT_PATH, self.npz_path))) # keys = list(param_dict.keys()) self.set_params_snn(warm_params_dict) elif self.pkl_path: data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path)) warm_params = data['policy'].get_params_internal() self.set_params_snn(warm_params) mean_var, log_std_var = L.get_output([l_mean, l_log_std]) if self.min_std is not None: log_std_var = TT.maximum(log_std_var, np.log(self.min_std)) self._l_mean = l_mean self._l_log_std = l_log_std self._dist = DiagonalGaussian(action_dim) LasagnePowered.__init__(self, [l_mean, l_log_std]) super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec) # debug obs_snn_var = L.get_output(l_obs_snn) self._l_obs_snn = ext.compile_function( inputs=[all_obs_var], outputs=obs_snn_var, ) # self._log_std = ext.compile_function( # inputs=[all_obs_var], # outputs=log_std_var, # ) self._mean = ext.compile_function( inputs=[all_obs_var], outputs=mean_var, ) self._f_dist = ext.compile_function( inputs=[all_obs_var], outputs=[mean_var, log_std_var], ) # if I want to monitor the selector output self._f_select = ext.compile_function( inputs=[all_obs_var], outputs=selection_var, )
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {None: 0} self.ivocab = {0: None} self.word2vec = word2vec self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in self.max_inp_sent_len = 0 self.max_q_len = 0 """ #To Use All Vocab self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 
'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #""" self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.imatrix('input_var') self.q_var = T.ivector('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len) self.pe_matrix_q = self.pe_matrix(self.max_q_len) print "==> building input module" #positional encoder weights self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) #biGRU input fusion weights self.W_inp_res_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, 
)) self.W_inp_upd_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_res_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in, sequences=self.input_var) self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps) self.inp_c = self.input_module_full(self.inp_sent_reps) self.q_q = self.sum_pos_encodings_q(self.q_var) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 4 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, 
shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # add conditional ending? dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_pe, self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd, self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd, self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd, self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd, self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd, self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adam(self.loss, self.params, learning_rate=0.0001, beta1=0.5) #from DCGAN paper self.attentions = T.stack(self.attentions) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss, self.attentions], updates=updates, on_unused_input='warn', allow_input_downcast=True) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss, self.attentions], on_unused_input='warn', allow_input_downcast=True)
    dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh)
    probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax)
    return probs

X_state = T.fmatrix()
X_action = T.bvector()
X_reward = T.fvector()
X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)
policy_ = get_output(prob_values)
policy = theano.function(inputs=[X_state], outputs=policy_,
                         allow_input_downcast=True)

# REINFORCE objective: crossentropy(pi, a) is -log pi(a|s), weighted by the return
loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()
params = get_all_params(prob_values)
updates = adam(loss, params, learning_rate=learning_rate)
update_network = theano.function(inputs=[X_state, X_action, X_reward],
                                 outputs=loss, updates=updates,
                                 allow_input_downcast=True)
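# A minimal sketch of one REINFORCE update using the functions compiled above.
# The environment interface (env.reset/env.step) and gamma are assumptions for
# illustration; they are not part of the original source.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards over one episode
    out = np.zeros(len(rewards), dtype='float32')
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

states, actions, rewards = [], [], []
s = env.reset()
done = False
while not done:
    p = policy(s[None, :])[0]            # action probabilities for one state
    a = np.random.choice(len(p), p=p)    # sample an action from the policy
    s2, r, done = env.step(a)
    states.append(s); actions.append(a); rewards.append(r)
    s = s2
# X_action is a bvector (int8); returns serve as the per-step weights
update_network(np.array(states), np.array(actions, dtype='int8'),
               discounted_returns(rewards))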
def create_network(available_actions_count): # Create the input variables s1 = tensor.tensor4("State") a = tensor.vector("Action", dtype="int32") q2 = tensor.vector("Q2") r = tensor.vector("Reward") isterminal = tensor.vector("IsTerminal", dtype="int8") # Create the input layer of the network. dqn = InputLayer(shape=[None, 1, resolution[0], resolution[1]], input_var=s1) # Add 2 convolutional layers with ReLu activation dqn = Conv2DLayer(dqn, num_filters=8, filter_size=[6, 6], nonlinearity=rectify, W=HeUniform("relu"), b=Constant(.1), stride=3) dqn = Conv2DLayer(dqn, num_filters=8, filter_size=[3, 3], nonlinearity=rectify, W=HeUniform("relu"), b=Constant(.1), stride=2) # Add a single fully-connected layer. dqn = DenseLayer(dqn, num_units=128, nonlinearity=rectify, W=HeUniform("relu"), b=Constant(.1)) # Add the output layer (also fully-connected). # (no nonlinearity as it is for approximating an arbitrary real function) dqn = DenseLayer(dqn, num_units=available_actions_count, nonlinearity=None) # Define the loss function q = get_output(dqn) # target differs from q only for the selected action. The following means: # target_Q(s,a) = r + gamma * max Q(s2,_) if isterminal else r target_q = tensor.set_subtensor( q[tensor.arange(q.shape[0]), a], r + discount_factor * (1 - isterminal) * q2) loss = squared_error(q, target_q).mean() # Update the parameters according to the computed gradient using RMSProp. params = get_all_params(dqn, trainable=True) updates = rmsprop(loss, params, learning_rate) # Compile the theano functions print("Compiling the network ...") function_learn = theano.function([s1, q2, a, r, isterminal], loss, updates=updates, name="learn_fn") function_get_q_values = theano.function([s1], q, name="eval_fn") function_get_best_action = theano.function([s1], tensor.argmax(q), name="test_fn") print("Network compiled.") def simple_get_best_action(state): return function_get_best_action( state.reshape([1, 1, resolution[0], resolution[1]])) # Returns Theano objects for the net and functions. return dqn, function_learn, function_get_q_values, simple_get_best_action
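# A minimal sketch of one Q-learning step with the functions returned above,
# assuming numpy is imported as np and a replay memory exposing a
# get_sample(batch_size) method; the memory implementation is not shown here.
def learn_from_memory(memory, batch_size=64):
    if memory.size > batch_size:
        # sample a batch of transitions (s1, a, s2, isterminal, r)
        s1, a, s2, isterminal, r = memory.get_sample(batch_size)
        # bootstrap target: max_a' Q(s2, a') under the current parameters
        q2 = np.max(function_get_q_values(s2), axis=1)
        # function_learn folds r + discount_factor * (1 - isterminal) * q2
        # into the target for the chosen actions (see target_q above)
        function_learn(s1, q2, a, r, isterminal)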
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16): # lasagne way l_in = InputLayer( (None, seq_len, input_dim), input_var=theano.shared( np.random.normal(size=[batch_size, seq_len, input_dim])), name='input seq') l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm') l_gru0 = GRULayer(l_in, n_hidden, name='gru') f_predict0 = theano.function([], get_output([l_lstm0, l_gru0])) # agentnet way s_in = InputLayer((None, input_dim), name='in') s_prev_cell = InputLayer((None, n_hidden), name='cell') s_prev_hid = InputLayer((None, n_hidden), name='hid') s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm') s_prev_gru = InputLayer((None, n_hidden), name='hid') s_gru = GRUCell(s_prev_gru, s_in, name='gru') rec = Recurrence(state_variables=OrderedDict({ s_lstm_cell: s_prev_cell, s_lstm_hid: s_prev_hid, s_gru: s_prev_gru }), input_sequences={s_in: l_in}, unroll_scan=False) state_seqs, _ = rec.get_sequence_layers() l_lstm1 = state_seqs[s_lstm_hid] l_gru1 = state_seqs[s_gru] f_predict1 = theano.function([], get_output([l_lstm1, l_gru1])) # lstm param transfer old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name) new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name) for old, new in zip(old_params, new_params): print old.name, '<-', new.name assert tuple(old.shape.eval()) == tuple(new.shape.eval()) old.set_value(new.get_value()) # gru param transfer old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name) new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name) for old, new in zip(old_params, new_params): print old.name, '<-', new.name assert tuple(old.shape.eval()) == tuple(new.shape.eval()) old.set_value(new.get_value()) lstm0_out, gru0_out = f_predict0() lstm1_out, gru1_out = f_predict1() assert np.allclose(lstm0_out, lstm1_out) assert np.allclose(gru0_out, gru1_out)
def __init__( self, env_spec, latent_dim=2, latent_name='bernoulli', bilinear_integration=False, resample=False, hidden_sizes=(32, 32), learn_std=True, init_std=1.0, adaptive_std=False, std_share_network=False, std_hidden_sizes=(32, 32), std_hidden_nonlinearity=NL.tanh, hidden_nonlinearity=NL.tanh, output_nonlinearity=None, min_std=1e-4, ): """ :param latent_dim: dimension of the latent variables :param latent_name: distribution of the latent variables :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever agent is reset, which can happen several times along the rollout with rollout in utils_snn) """ # for _ in range(10): # print("init!") # print("initilizaer run!") self.latent_dim = latent_dim ##could I avoid needing this self for the get_action? self.latent_name = latent_name self.bilinear_integration = bilinear_integration self.resample = resample self.min_std = min_std self.hidden_sizes = hidden_sizes self.pre_fix_latent = np.array( [] ) # if this is not empty when using reset() it will use this latent self.latent_fix = np.array( []) # this will hold the latents variable sampled in reset() self._set_std_to_0 = False if latent_name == 'normal': self.latent_dist = DiagonalGaussian(self.latent_dim) self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim)) elif latent_name == 'bernoulli': self.latent_dist = Bernoulli(self.latent_dim) self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim)) elif latent_name == 'categorical': self.latent_dist = Categorical(self.latent_dim) if self.latent_dim > 0: self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim)) else: self.latent_dist_info = dict(prob=np.ones(self.latent_dim)) else: raise NotImplementedError Serializable.quick_init(self, locals()) assert isinstance(env_spec.action_space, Box) if self.bilinear_integration: obs_dim = env_spec.observation_space.flat_dim + latent_dim +\ env_spec.observation_space.flat_dim * latent_dim else: obs_dim = env_spec.observation_space.flat_dim + latent_dim # here only if concat. 
action_dim = env_spec.action_space.flat_dim mean_network = MLP( input_shape=(obs_dim, ), output_dim=action_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=output_nonlinearity, name="meanMLP", ) l_mean = mean_network.output_layer obs_var = mean_network.input_layer.input_var if adaptive_std: l_log_std = MLP(input_shape=(obs_dim, ), input_var=obs_var, output_dim=action_dim, hidden_sizes=std_hidden_sizes, hidden_nonlinearity=std_hidden_nonlinearity, output_nonlinearity=None, name="log_stdMLP").output_layer else: l_log_std = ParamLayer( mean_network.input_layer, num_units=action_dim, param=lasagne.init.Constant(np.log(init_std)), name="output_log_std", trainable=learn_std, ) mean_var, log_std_var = L.get_output([l_mean, l_log_std]) if self.min_std is not None: log_std_var = TT.maximum(log_std_var, np.log(self.min_std)) self._l_mean = l_mean self._l_log_std = l_log_std self._dist = DiagonalGaussian(action_dim) LasagnePowered.__init__(self, [l_mean, l_log_std]) super(GaussianMLPPolicy_snn, self).__init__(env_spec) self._f_dist = ext.compile_function( inputs=[obs_var], outputs=[mean_var, log_std_var], ) # Sy: load policy self._layers_mean = mean_network.layers l_mean = mean_network.output_layer obs_var = mean_network.input_layer.input_var if adaptive_std: log_std_network = MLP(input_shape=(obs_dim, ), input_var=obs_var, output_dim=action_dim, hidden_sizes=std_hidden_sizes, hidden_nonlinearity=std_hidden_nonlinearity, output_nonlinearity=None, name="log_stdMLP") l_log_std = log_std_network.output_layer self._layers_log_std = log_std_network.layers else: l_log_std = ParamLayer( mean_network.input_layer, num_units=action_dim, param=lasagne.init.Constant(np.log(init_std)), name="output_log_std", trainable=learn_std, ) self._layers_log_std = [l_log_std] self._layers_snn = self._layers_mean + self._layers_log_std # this returns a list with the "snn" layers
def __init__(self, train_raw, dev_raw, test_raw, word2vec, word_vector_size, answer_module, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, dropout, **kwargs): print "generate sentence answer for mctest" print "==> not used params in DMN class:", kwargs.keys() self.word2vec = word2vec self.word_vector_size = word_vector_size # add eng_of_sentence tag for answer generation #self.end_tag = len(word2vec) #self.vocab_size = self.end_tag+1 self.vocab_size = len(word2vec) self.dim = dim # hidden state size self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.answer_module = answer_module self.l2 = l2 self.normalize_attention = normalize_attention self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_max_n = self._process_input( train_raw) self.dev_input, self.dev_q, self.dev_answer, self.dev_input_mask, self.dev_max_n = self._process_input( dev_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_max_n = self._process_input( test_raw) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.ivector('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.max_n = T.iscalar('max_n') self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % 
self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) self.prediction = self.prediction.dimshuffle('x', 0) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [ a, y ] #, theano.scan_module.until(n>=max_n)) # or argmax==self.end_tag) dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=self.max_n) self.prediction = results[1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': #feedforward': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).sum() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( 
inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.max_n ], allow_input_downcast=True, outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.max_n ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions])
def compile_theano_functions(self, data_type='2D'):
    assert self.net is not None
    ### symbolic theano input
    theano_args = OrderedDict()
    dim = len(self.cf.dim)
    if data_type == '2D':
        assert dim == 2
        theano_args['X'] = T.tensor4()
        theano_args['y'] = T.dmatrix()
        self.logger.info('Net: Working with 2D data.')
    elif data_type == '3D':
        assert dim == 3
        theano_args['X'] = T.tensor5()
        theano_args['y'] = T.ivector()
        self.logger.info('Net: Working with 3D data.')
    val_args = deepcopy(theano_args)
    train_args = deepcopy(theano_args)
    train_args['lr'] = T.scalar(name='lr')

    ### prediction functions
    # get softmax prediction of shape (b, classes)
    prediction_train = get_output(self.net[self.cf.out_layer], train_args['X'],
                                  deterministic=False)
    prediction_val = get_output(self.net[self.cf.out_layer], val_args['X'],
                                deterministic=True)
    self.predict['train'] = theano.function([train_args['X']], prediction_train)
    self.predict['val'] = theano.function([val_args['X']], prediction_val)

    ### l2 loss
    self.loss['train'] = squared_error(prediction_train, train_args['y']).mean()
    self.loss['val'] = squared_error(prediction_val, val_args['y']).mean()
    if self.cf.use_weight_decay:
        training_loss = self.loss['train'] + \
            self.cf.weight_decay * lasagne.regularization.regularize_network_params(
                self.net[self.cf.out_layer], lasagne.regularization.l2)
        self.logger.info('Net: Using weight decay of {}.'.format(self.cf.weight_decay))
    else:
        training_loss = self.loss['train']

    ### accuracy
    # train_acc = T.mean(T.eq(T.argmax(prediction_train_smax, axis=1), train_args['y']))
    # val_acc = T.mean(T.eq(T.argmax(prediction_val_smax, axis=1), val_args['y']))

    ### training functions
    params = get_all_params(self.net[self.cf.out_layer], trainable=True)
    grads = theano.grad(training_loss, params)
    updates = adam(grads, params, learning_rate=train_args['lr'])
    # wrap the dict views in list() so this also works on Python 3
    self.train_fn = theano.function(list(train_args.values()),
                                    [self.loss['train'], prediction_train],
                                    updates=updates)
    self.val_fn = theano.function(list(val_args.values()),
                                  [self.loss['val'], prediction_val])
    self.logger.info('Net: Compiled theano functions.')
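# Because the compiled functions take their inputs in the OrderedDict order
# built above, train_fn expects (X, y, lr) and val_fn expects (X, y). A usage
# sketch on a hypothetical instance `net` with hypothetical batch arrays:
train_loss, train_pred = net.train_fn(x_batch, y_batch, 1e-4)
val_loss, val_pred = net.val_fn(x_val, y_val)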
def main(n=5, k=12, num_epochs=50, model=None): # Check if cifar data exists print("n= ", n, " k= ", k) if not os.path.exists("./cifar-10-batches-py"): print( "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'." ) return # Load the dataset print("Loading data...") data = load_data() X_train = data['X_train'] Y_train = data['Y_train'] X_test = data['X_test'] Y_test = data['Y_test'] # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') target_var = T.ivector('targets') # Create neural network model print("Building model and compiling functions...") network = build_cnn(input_var, n, k) print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True)) if model is None: # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy( prediction, target_var) loss = loss.mean() # add weight decay all_layers = lasagne.layers.get_all_layers(network) l2_penalty = lasagne.regularization.regularize_layer_params( all_layers, lasagne.regularization.l2) * 0.0001 loss = loss + l2_penalty # Create update expressions for training # Stochastic Gradient Descent (SGD) with momentum params = lasagne.layers.get_all_params(network, trainable=True) lr = 0.1 sh_lr = theano.shared(lasagne.utils.floatX(lr)) updates = lasagne.updates.momentum(loss, params, learning_rate=sh_lr, momentum=0.9) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates) # Create a loss expression for validation/testing test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) ###function for prediction predict_class = theano.function(inputs=[input_var], outputs=test_prediction) ####function to generate the related hash-code for the images layers = lasagne.layers.get_all_layers(network) for l in layers: if l.name == 'hash_layer': hash_layer_out = l prediction_hash = T.switch( T.le(get_output(hash_layer_out, input_var), 0.5), 0., 1.) predict_hash = theano.function([input_var], prediction_hash) #####################1. 
TRAINING OR LOADING THE MODEL ######################## if model is None: validation_loss = 10 # launch the training loop print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # shuffle training data train_indices = np.arange(100000) np.random.shuffle(train_indices) X_train = X_train[train_indices, :, :, :] Y_train = Y_train[train_indices] # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(X_train, Y_train, 128, shuffle=True, augment=True): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) if val_err / val_batches < validation_loss: validation_loss = val_err / val_batches np.savez( 'cifar10_deep_residual_hashing_n' + str(n) + '_k' + str(k) + '.npz', *lasagne.layers.get_all_param_values(network)) print("saving model...") print(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_batches * 100)) # adjust learning rate as in paper # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs if (epoch + 1) == 41 or (epoch + 1) == 61: new_lr = sh_lr.get_value() * 0.1 print("New LR:" + str(new_lr)) sh_lr.set_value(lasagne.utils.floatX(new_lr)) # dump the network weights to a file : #np.savez('cifar10_deep_residual_model.npz', *lasagne.layers.get_all_param_values(network)) else: # load network weights from model file print('Loading pre-trained model') with np.load(model) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(network, param_values) # Calculate validation error of model (defined before it is called below): def validation_of_the_model(): test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100)) validation_of_the_model() #####################2.
GENERATION OF THE CODES ########################## def save_obj(obj, name): with open(name + '.pkl', 'wb') as f: pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) def load_obj(name): with open(name + '.pkl', 'rb') as f: return pickle.load(f) hashes_training = [] index = 0 # generating codes for training images for batch in iterate_minibatches(X_train, Y_train, 500, shuffle=False): inputs, targets = batch pred = predict_hash(inputs).astype(int) for element in pred: hashes_training.append((index, element, Y_train[index])) index += 1 save_obj(hashes_training, 'cifar10_n' + str(n) + '_hash' + str(k) + 'k_codes') hashes_testing = [] index = 0 # generating codes for testing images for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False): inputs, targets = batch pred = predict_hash(inputs).astype(int) for element in pred: hashes_testing.append((index, element, Y_test[index])) index += 1 save_obj(hashes_testing, 'cifar10_n' + str(n) + '_hash' + str(k) + 'k_test_codes') ########################3. MAP evaluation ############################# # for all elements in test_examples (should be a list of len=10000) # MAP evaluation as shown in https://github.com/kevinlin311tw/caffe-cvprw15 from operator import itemgetter k_ = 1000 NS = np.arange(1, k_ + 1) sum_tp = np.zeros(len(NS)) QueryTimes = 10000 AP = np.zeros(QueryTimes) index_of_query = 0 for image_test in hashes_testing: for index in range(len(hashes_training)): hashes_training[index] = ( hashes_training[index][0], hashes_training[index][1], hashes_training[index][2], np.count_nonzero(image_test[1] != hashes_training[index][1]) ) #hamming2(image_test[1],hashes_training[index][1])) hashes_training.sort(key=itemgetter(3)) # begin scoring the top-k ranked list buffer_yes = np.zeros(k_) total_relevant = 0 for i in range(k_): # if the label matches the query label, count it as relevant if hashes_training[i][2] == image_test[2]: buffer_yes[i] = 1 total_relevant += 1 #print (total_relevant) P = np.divide(np.cumsum(buffer_yes), NS, dtype=float) if np.sum(buffer_yes, axis=0) == 0: AP[index_of_query] = 0 else: AP[index_of_query] = np.sum(np.multiply(P, buffer_yes), axis=0) / np.sum(buffer_yes, axis=0) #print (index_of_query, AP[index_of_query]) sum_tp = sum_tp + np.cumsum(buffer_yes) index_of_query += 1 precision_at_k = np.divide(sum_tp, NS * QueryTimes) map_ = np.mean(AP) print('precision_at_k', precision_at_k) # array of values save_obj(precision_at_k, 'precision_at_k_n' + str(n) + '_k' + str(k)) print('map', map_) # numeric value save_obj(map_, 'map_n' + str(n) + '_k' + str(k)) print('n' + str(n) + 'k' + str(k))
# fixed random seeds rng_data = np.random.RandomState(args.seed_data) rng = np.random.RandomState(args.seed) theano_rng = MRG_RandomStreams(rng.randint(2**15)) lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15))) # load CIFAR-10 test_matched, test_unmatched = get_data_patches_test(args.test_data, args.data_dir) trainx = get_data_patches_training(args.data_name, args.data_dir) trainx = trainx[rng.permutation(trainx.shape[0])] # specify generative model gen_layers = get_generator(args.batch_size, theano_rng) gen_dat = ll.get_output(gen_layers[-1]) # specify discriminative model disc_layers, f_low_dim, _ = get_discriminator_brown(args.num_features) load_model(gen_layers, args.generator_out) load_model(disc_layers, args.discriminator_out) x_temp = T.tensor4() # Test generator in sampling procedure samplefun = th.function(inputs=[], outputs=gen_dat) sample_x = [] for k in range(20): sample_x.append(samplefun()) sample_x = np.concatenate(sample_x, axis=0)
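# x_temp is declared above but never used in this excerpt. Presumably it is
# meant to compile a patch-descriptor function; the following sketch assumes
# f_low_dim is the low-dimensional feature layer returned by
# get_discriminator_brown, whose definition is not shown here.
descriptor_fn = th.function(inputs=[x_temp],
                            outputs=ll.get_output(f_low_dim, x_temp,
                                                  deterministic=True))
matched_features = descriptor_fn(test_matched[:args.batch_size])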
def __init__(self, env_spec, hidden_dim=32, feature_network=None, state_include_action=True, hidden_nonlinearity=NL.tanh): """ :param env_spec: A spec for the env. :param hidden_dim: dimension of hidden layer :param hidden_nonlinearity: nonlinearity used for each hidden layer :return: """ assert isinstance(env_spec.action_space, Discrete) Serializable.quick_init(self, locals()) super(CategoricalGRUPolicy, self).__init__(env_spec) obs_dim = env_spec.observation_space.flat_dim action_dim = env_spec.action_space.flat_dim if state_include_action: input_dim = obs_dim + action_dim else: input_dim = obs_dim l_input = L.InputLayer(shape=(None, None, input_dim), name="input") if feature_network is None: feature_dim = input_dim l_flat_feature = None l_feature = l_input else: feature_dim = feature_network.output_layer.output_shape[-1] l_flat_feature = feature_network.output_layer l_feature = OpLayer( l_flat_feature, extras=[l_input], name="reshape_feature", op=lambda flat_feature, input: TT.reshape( flat_feature, [input.shape[0], input.shape[1], feature_dim]), shape_op=lambda _, input_shape: (input_shape[0], input_shape[1], feature_dim)) prob_network = GRUNetwork(input_shape=(feature_dim, ), input_layer=l_feature, output_dim=env_spec.action_space.n, hidden_dim=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=TT.nnet.softmax, name="prob_network") self.prob_network = prob_network self.feature_network = feature_network self.l_input = l_input self.state_include_action = state_include_action flat_input_var = TT.matrix("flat_input") if feature_network is None: feature_var = flat_input_var else: feature_var = L.get_output( l_flat_feature, {feature_network.input_layer: flat_input_var}) self.f_step_prob = ext.compile_function( [flat_input_var, prob_network.step_prev_hidden_layer.input_var], L.get_output([ prob_network.step_output_layer, prob_network.step_hidden_layer ], {prob_network.step_input_layer: feature_var})) self.input_dim = input_dim self.action_dim = action_dim self.hidden_dim = hidden_dim self.prev_action = None self.prev_hidden = None self.dist = RecurrentCategorical(env_spec.action_space.n) out_layers = [prob_network.output_layer] if feature_network is not None: out_layers.append(feature_network.output_layer) LasagnePowered.__init__(self, out_layers)
    ffn3 = icnn3
    ffn4 = LL.ConcatLayer([inp, ffn1, ffn2, ffn3], axis=1, cropping=None)
    ffn = LL.DenseLayer(ffn4, nclasses, nonlinearity=utils_lasagne.log_softmax)
    return ffn

inp = LL.InputLayer(shape=(None, nin))
patch_op = LL.InputLayer(input_var=Tsp.csc_fmatrix('patch_op'), shape=(None, None))
print(patch_op.shape[0])
ffn = get_model(inp, patch_op)

output = LL.get_output(ffn)
pred = LL.get_output(ffn, deterministic=True)
target = T.ivector('idxs')

cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean()
acc = LO.categorical_accuracy(pred, target).mean()
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)
cost = cla + l2_weight * regL2

params = LL.get_all_params(ffn, trainable=True)
grads = T.grad(cost, params)
grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2)
updates = L.updates.adam(grads, params, learning_rate=0.001)

funcs = dict()
nb_valid_batch = 4 batch_size = 5 ###################### # Building the model # ###################### # Symbolic variables x = T.tensor4('x', dtype=theano.config.floatX) # Creating the model model = build_model2(input_var=x) with np.load(data_path + 'best_cnn_model.npz') as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] layers.set_all_param_values(model, param_values) output = layers.get_output(model, deterministic=True) # Creating theano function predict_target = theano.function( [x], output, allow_input_downcast=True, ) ###################### # Predict the target # ###################### for i in range(nb_valid_batch): input, target = get_image(data_path, valid_input_path, valid_target_path, str(i))
def dist_info_sym(self, obs_var, state_info_var=None): mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std], obs_var) if self.min_std is not None: log_std_var = TT.maximum(log_std_var, np.log(self.min_std)) return dict(mean=mean_var, log_std=log_std_var)
def __init__(self, input_vars, target_vars, l_out, loss, optimizer,
             learning_rate=0.001, id=None):
    if not isinstance(input_vars, Sequence):
        raise ValueError('input_vars should be a sequence, instead got %s' % (input_vars, ))
    if not isinstance(target_vars, Sequence):
        # report the offending value, not input_vars
        raise ValueError('target_vars should be a sequence, instead got %s' % (target_vars, ))
    self.get_options()
    self.input_vars = input_vars
    self.l_out = l_out
    self.loss = loss
    self.optimizer = optimizer
    self.id = id
    id_tag = (self.id + '/') if self.id else ''
    id_tag_log = (self.id + ': ') if self.id else ''
    if self.options.verbosity >= 6:
        output_model_structure(l_out)
    params = self.params()
    (monitored, train_loss_grads, synth_vars) = self.get_train_loss(target_vars, params)
    self.monitored_tags = monitored.keys()
    if self.options.true_grad_clipping:
        scaled_grads = total_norm_constraint(train_loss_grads,
                                             self.options.true_grad_clipping)
    else:
        scaled_grads = train_loss_grads
    updates = optimizer(scaled_grads, params, learning_rate=learning_rate)
    self.optimizer_vars = [var for var in updates if var not in params]
    if not self.options.no_nan_suppression:
        # TODO: print_mode='all' somehow is always printing, even when
        # there are no NaNs. But tests are passing, even on GPU!
        updates = apply_nan_suppression(updates, print_mode='none')
    if self.options.detect_nans:
        mode = MonitorMode(post_func=detect_nan)
    else:
        mode = None
    if self.options.verbosity >= 2:
        print(id_tag_log + 'Compiling training function')
    params = input_vars + target_vars + synth_vars
    if self.options.verbosity >= 6:
        print('params = %s' % (params, ))
    self.train_fn = theano.function(params, monitored.values(), updates=updates,
                                    mode=mode, name=id_tag + 'train',
                                    on_unused_input='warn')
    if self.options.run_dir and not self.options.no_graphviz:
        self.visualize_graphs({'loss': monitored['loss']},
                              out_dir=self.options.run_dir)
    test_prediction = get_output(l_out, deterministic=True)
    if self.options.verbosity >= 2:
        print(id_tag_log + 'Compiling prediction function')
    if self.options.verbosity >= 6:
        print('params = %s' % (input_vars, ))
    self.predict_fn = theano.function(input_vars, test_prediction, mode=mode,
                                      name=id_tag + 'predict',
                                      on_unused_input='ignore')
    if self.options.run_dir and not self.options.no_graphviz:
        self.visualize_graphs({'test_prediction': test_prediction},
                              out_dir=self.options.run_dir)
def __init__( self, input_shape, output_dim, prob_network=None, hidden_sizes=(32, 32), hidden_nonlinearity=NL.rectify, optimizer=None, use_trust_region=True, step_size=0.01, normalize_inputs=True, name=None, ): """ :param input_shape: Shape of the input data. :param output_dim: Dimension of output. :param hidden_sizes: Number of hidden units of each layer of the mean network. :param hidden_nonlinearity: Non-linearity used for each layer of the mean network. :param optimizer: Optimizer for minimizing the negative log-likelihood. :param use_trust_region: Whether to use trust region constraint. :param step_size: KL divergence constraint for each iteration """ Serializable.quick_init(self, locals()) if optimizer is None: if use_trust_region: optimizer = PenaltyLbfgsOptimizer() else: optimizer = LbfgsOptimizer() self.output_dim = output_dim self._optimizer = optimizer if prob_network is None: prob_network = MLP( input_shape=input_shape, output_dim=output_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=NL.softmax, ) l_prob = prob_network.output_layer LasagnePowered.__init__(self, [l_prob]) xs_var = prob_network.input_layer.input_var ys_var = TT.imatrix("ys") old_prob_var = TT.matrix("old_prob") x_mean_var = theano.shared(np.zeros((1, ) + input_shape), name="x_mean", broadcastable=(True, ) + (False, ) * len(input_shape)) x_std_var = theano.shared(np.ones((1, ) + input_shape), name="x_std", broadcastable=(True, ) + (False, ) * len(input_shape)) normalized_xs_var = (xs_var - x_mean_var) / x_std_var prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var}) old_info_vars = dict(prob=old_prob_var) info_vars = dict(prob=prob_var) dist = self._dist = Categorical(output_dim) mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars)) loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars)) predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim) self._f_predict = ext.compile_function([xs_var], predicted) self._f_prob = ext.compile_function([xs_var], prob_var) self._prob_network = prob_network self._l_prob = l_prob optimizer_args = dict( loss=loss, target=self, network_outputs=[prob_var], ) if use_trust_region: optimizer_args["leq_constraint"] = (mean_kl, step_size) optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var] else: optimizer_args["inputs"] = [xs_var, ys_var] self._optimizer.update_opt(**optimizer_args) self._use_trust_region = use_trust_region self._name = name self._normalize_inputs = normalize_inputs self._x_mean_var = x_mean_var self._x_std_var = x_std_var
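# Sketch of the input normalisation the shared variables above support;
# this mirrors what a companion fit() method would do before optimisation
# (an assumption for illustration, not the original code).
def update_input_stats(x_mean_var, x_std_var, xs):
    # xs: array of shape (N,) + input_shape; keepdims preserves the leading
    # broadcastable axis of the shared variables.
    x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
    x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)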
gen_layers.append( nn.batch_norm(LL.DenseLayer(gen_layers[-1], num_units=500, nonlinearity=T.nnet.softplus), g=None)) gen_layers.append( nn.batch_norm(LL.DenseLayer(gen_layers[-1], num_units=500, nonlinearity=T.nnet.softplus), g=None)) gen_layers.append( nn.l2normalize( LL.DenseLayer(gen_layers[-1], num_units=28**2, nonlinearity=T.nnet.sigmoid))) gen_dat = LL.get_output(gen_layers[-1], deterministic=False) # specify supervised model layers = [LL.InputLayer(shape=(None, 28**2))] layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.3)) layers.append(nn.DenseLayer(layers[-1], num_units=1000)) layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5)) layers.append(nn.DenseLayer(layers[-1], num_units=500)) layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5)) layers.append(nn.DenseLayer(layers[-1], num_units=250)) layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5)) layers.append(nn.DenseLayer(layers[-1], num_units=250)) layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5)) layers.append(nn.DenseLayer(layers[-1], num_units=250)) layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5)) layers.append(
def define_model(input_var, **kwargs):
    """
    Defines the model and returns (network, validation network output).

    - Return layers.get_output(final_layer) if the validation network output
      and the training network output are the same.
    - For example, return layers.get_output(final_layer, deterministic=True)
      if there is a dropout layer.
    - Use **kwargs to pass model-specific parameters.
    """
    conv1_filter_count = 100
    conv1_filter_size = 5
    pool1_size = 2
    n_dense_units = 3000

    batch_size = input_var.shape[0]
    image_size = 32
    after_conv1 = image_size
    after_pool1 = (after_conv1 + pool1_size - 1) // pool1_size

    input = layers.InputLayer(shape=(None, 3, image_size, image_size),
                              input_var=input_var)
    greyscale_input = our_layers.GreyscaleLayer(
        incoming=input,
        random_greyscale=True,
    )
    conv1 = layers.Conv2DLayer(
        incoming=greyscale_input,
        num_filters=conv1_filter_count,
        filter_size=conv1_filter_size,
        stride=1,
        pad='same',
        nonlinearity=lasagne.nonlinearities.sigmoid,
    )
    pool1 = layers.MaxPool2DLayer(
        incoming=conv1,
        pool_size=pool1_size,
        stride=pool1_size,
    )
    dense1 = layers.DenseLayer(
        incoming=pool1,
        num_units=n_dense_units,
        nonlinearity=lasagne.nonlinearities.rectify,
    )
    pre_unpool1 = layers.DenseLayer(
        incoming=dense1,
        num_units=conv1_filter_count * (after_pool1 ** 2),
        nonlinearity=lasagne.nonlinearities.linear,
    )
    pre_unpool1 = layers.ReshapeLayer(
        incoming=pre_unpool1,
        shape=(batch_size, conv1_filter_count) + (after_pool1, after_pool1),
    )
    unpool1 = our_layers.Unpool2DLayer(
        incoming=pre_unpool1,
        kernel_size=pool1_size,
    )
    deconv1 = layers.Conv2DLayer(
        incoming=unpool1,
        num_filters=3,
        filter_size=conv1_filter_size,
        stride=1,
        pad='same',
        nonlinearity=lasagne.nonlinearities.sigmoid,
    )
    output = layers.ReshapeLayer(incoming=deconv1, shape=input_var.shape)
    return (output, layers.get_output(output))
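# Illustrative wiring of define_model into a training function; the squared-
# error reconstruction loss and the Adam optimiser are assumptions, not the
# original training code.
input_var = T.tensor4('inputs')
network, val_output = define_model(input_var)
reconstruction = layers.get_output(network)
recon_loss = lasagne.objectives.squared_error(reconstruction, input_var).mean()
params = layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(recon_loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var], recon_loss, updates=updates)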
def __init__(self, feature_shape, latent_size, hidden_structure, reconstruction_distribution=None, number_of_reconstruction_classes=None, use_count_sum=False, use_batch_norm=False): self.use_count_sum = use_count_sum and \ (reconstruction_distribution != "bernoulli") # Add warm-up to the model, weighting the KL-term gradually higher for each epoch self.use_batch_norm = use_batch_norm print("Setting up model.") print(" feature size: {}".format(feature_shape)) print(" latent size: {}".format(latent_size)) print(" hidden structure: {}".format(", ".join( map(str, hidden_structure)))) if type(reconstruction_distribution) == str: print(" reconstruction distribution: " + reconstruction_distribution) else: print(" reconstruction distribution: custom") if number_of_reconstruction_classes > 0: print( " reconstruction classes: {}".format( number_of_reconstruction_classes), " (including 0s)") if self.use_count_sum: print(" using count sums") if self.use_batch_norm: print(" using batch normalisation of each layer.") print("") # Setup super(VariationalAutoEncoderForCounts, self).__init__() self.feature_shape = feature_shape self.latent_size = latent_size self.hidden_structure = hidden_structure symbolic_x = T.matrix('x') # counts symbolic_z = T.matrix('z') # latent variable self.number_of_epochs_trained = 0 symbolic_learning_rate = T.scalar("epsilon") symbolic_warm_up_weight = T.scalar("beta") self.learning_curves = { "training": { "LB": [], "ENRE": [], "KL": [], "KL_all": [] }, "validation": { "LB": [], "ENRE": [], "KL": [] } } if reconstruction_distribution: if type(reconstruction_distribution) == str: if number_of_reconstruction_classes > 0: reconstruction_distribution = "softmax_" + \ reconstruction_distribution self.k_max = number_of_reconstruction_classes - 1 reconstruction_distribution = \ reconstruction_distributions[reconstruction_distribution] reconstruction_distribution = \ reconstruction_distribution(self.k_max) else: reconstruction_distribution = \ reconstruction_distributions[reconstruction_distribution] self.x_parameters = reconstruction_distribution["parameters"] self.reconstruction_activation_functions = \ reconstruction_distribution["activation functions"] self.expectedNegativeReconstructionError = \ reconstruction_distribution["function"] self.meanOfReconstructionDistribution = reconstruction_distribution[ "mean"] self.preprocess = reconstruction_distribution["preprocess"] else: reconstruction_distribution = "Gaussian (default)" # Use a Gaussian distribution as standard self.x_parameters = ["mu", "sigma"] self.reconstruction_activation_functions = { "mu": identity, "sigma": identity } self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \ log_normal(x, x_theta["mu"], x_theta["sigma"], eps) self.meanOfReconstructionDistribution = lambda x_theta: x_theta[ "mu"] self.preprocess = lambda x: x # if number_of_reconstruction_classes > 0: # # self.x_parameters += ["p_k"] # self.reconstruction_activation_functions["p_k"] = softmax # log_distribution = self.expectedNegativeReconstructionError # self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \ # log_cross_entropy_extended(x, x_theta, # log_distribution, k_max = number_of_reconstruction_classes - 1, # eps = 0.0) # mean_of_distribution = self.meanOfReconstructionDistribution # self.meanOfReconstructionDistribution = lambda x_theta: \ # meanOfCrossEntropyExtendedDistibution(x_theta, # mean_of_distribution, k_max = number_of_reconstruction_classes - 1) # self.k_max = number_of_reconstruction_classes - 1 
if self.use_count_sum: symbolic_n = T.matrix('n') # sum of counts # Models ## Recognition model q(z|x) l_enc_in = InputLayer(shape=(None, feature_shape), name="ENC_INPUT") l_enc = l_enc_in for i, hidden_size in enumerate(hidden_structure): l_enc = DenseLayer(l_enc, num_units=hidden_size, nonlinearity=rectify, name='ENC_DENSE{:d}'.format(i + 1)) if self.use_batch_norm: l_enc = batch_norm(l_enc) if self.use_batch_norm: l_z_mu = batch_norm( DenseLayer(l_enc, num_units=latent_size, nonlinearity=None, name='ENC_Z_MU')) l_z_log_var = batch_norm( DenseLayer(l_enc, num_units=latent_size, nonlinearity=lambda x: T.clip(x, -10, 10), name='ENC_Z_LOG_VAR')) else: l_z_mu = DenseLayer(l_enc, num_units=latent_size, nonlinearity=None, name='ENC_Z_MU') l_z_log_var = DenseLayer(l_enc, num_units=latent_size, nonlinearity=lambda x: T.clip(x, -10, 10), name='ENC_Z_LOG_VAR') # Sample a latent representation z \sim q(z|x) = N(mu(x), logvar(x)) l_z = SimpleSampleLayer(mean=l_z_mu, log_var=l_z_log_var, name="ENC_SAMPLE") self.encoder = l_z ## Generative model p(x|z) l_dec_z_in = InputLayer(shape=(None, latent_size), name="DEC_INPUT") if self.use_count_sum: l_dec_n_in = InputLayer(shape=(None, 1), name="DEC_N_INPUT") l_dec = ConcatLayer([l_dec_z_in, l_dec_n_in], axis=1, name="DEC_MERGE_INPUT") else: l_dec = l_dec_z_in for i, hidden_size in enumerate(reversed(hidden_structure)): if self.use_batch_norm: l_dec = batch_norm( DenseLayer( l_dec, num_units=hidden_size, nonlinearity=rectify, name='DEC_DENSE{:d}'.format(len(hidden_structure) - i))) else: l_dec = DenseLayer( l_dec, num_units=hidden_size, nonlinearity=rectify, name='DEC_DENSE{:d}'.format(len(hidden_structure) - i)) l_x_theta = {} for p in self.x_parameters: p_name = 'DEC_X_' + p.upper() if self.reconstruction_activation_functions[p] == softmax: if self.use_batch_norm: l_dense = batch_norm( DenseLayer(l_dec, num_units=feature_shape * (self.k_max + 1), nonlinearity=identity, name=p_name + "_DENSE")) else: l_dense = DenseLayer(l_dec, num_units=feature_shape * (self.k_max + 1), nonlinearity=identity, name=p_name + "_DENSE") l_reshape = ReshapeLayer(l_dense, (-1, (self.k_max + 1))) if self.use_batch_norm: l_softmax = batch_norm( DenseLayer(l_reshape, num_units=(self.k_max + 1), nonlinearity=softmax, name=p_name + "_SOFTMAX")) else: l_softmax = DenseLayer(l_reshape, num_units=(self.k_max + 1), nonlinearity=softmax, name=p_name + "_SOFTMAX") l_x_theta[p] = ReshapeLayer(l_softmax, (-1, feature_shape, (self.k_max + 1))) else: if self.use_batch_norm: l_x_theta[p] = batch_norm( DenseLayer(l_dec, num_units=feature_shape, nonlinearity=self. reconstruction_activation_functions[p], name=p_name)) else: l_x_theta[p] = DenseLayer( l_dec, num_units=feature_shape, nonlinearity=self. 
reconstruction_activation_functions[p], name=p_name) self.decoder = {p: l_x_theta[p] for p in self.x_parameters} ## Get outputs from models ## Training outputs z_train, z_mu_train, z_log_var_train = get_output( [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x}, deterministic=False) inputs = {l_dec_z_in: z_train} if self.use_count_sum: inputs[l_dec_n_in] = symbolic_n x_theta_train = get_output([l_x_theta[p] for p in self.x_parameters], inputs, deterministic=False) x_theta_train = { p: o for p, o in zip(self.x_parameters, x_theta_train) } ## Evaluation outputs z_eval, z_mu_eval, z_log_var_eval = get_output( [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x}, deterministic=True) inputs = {l_dec_z_in: z_eval} if self.use_count_sum: inputs[l_dec_n_in] = symbolic_n x_theta_eval = get_output([l_x_theta[p] for p in self.x_parameters], inputs, deterministic=True) x_theta_eval = {p: o for p, o in zip(self.x_parameters, x_theta_eval)} ## Sample outputs inputs = {l_dec_z_in: symbolic_z} if self.use_count_sum: inputs[l_dec_n_in] = symbolic_n x_theta_sample = get_output([l_x_theta[p] for p in self.x_parameters], inputs, deterministic=True) x_theta_sample = { p: o for p, o in zip(self.x_parameters, x_theta_sample) } # Likelihood lower_bound_train, log_p_x_train, KL__train, KL__train_all = \ self.lowerBound(symbolic_x, x_theta_train, z_mu_train, z_log_var_train, beta=symbolic_warm_up_weight) lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all = \ self.lowerBound(symbolic_x, x_theta_eval, z_mu_eval, z_log_var_eval) all_parameters = get_all_params( [l_z] + [l_x_theta[p] for p in self.x_parameters], trainable=True) print("Parameters to train:") for parameter in all_parameters: print(" {}: {}".format(parameter, parameter.get_value().shape)) # Let Theano do its magic and get all the gradients we need for training all_gradients = T.grad(-lower_bound_train, all_parameters) # Set the update function for parameters. The Adam optimizer works really well with VAEs. update_expressions = updates.adam(all_gradients, all_parameters, learning_rate=symbolic_learning_rate) inputs = [symbolic_x] if self.use_count_sum: inputs.append(symbolic_n) inputs.append(symbolic_learning_rate) inputs.append(symbolic_warm_up_weight) self.f_train = theano.function(inputs=inputs, outputs=[ lower_bound_train, log_p_x_train, KL__train, KL__train_all ], updates=update_expressions) inputs = [symbolic_x] if self.use_count_sum: inputs.append(symbolic_n) self.f_eval = theano.function( inputs=inputs, outputs=[lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all]) self.f_z = theano.function(inputs=[symbolic_x], outputs=[z_eval]) inputs = [symbolic_z] if self.use_count_sum: inputs.append(symbolic_n) self.f_sample = theano.function( inputs=inputs, outputs=[x_theta_sample[p] for p in self.x_parameters]) inputs = [symbolic_x] if self.use_count_sum: inputs.append(symbolic_n) self.f_recon = theano.function( inputs=inputs, outputs=[x_theta_eval[p] for p in self.x_parameters])
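# Minimal training-loop sketch for the functions compiled above; the data
# arrays, constructor arguments, and warm-up schedule are all assumptions
# for illustration (no count sums, default Gaussian reconstruction).
model = VariationalAutoEncoderForCounts(feature_shape=2000, latent_size=50,
                                        hidden_structure=[256, 128])
x_train = np.random.poisson(1.0, size=(1000, 2000)).astype(theano.config.floatX)
x_valid = np.random.poisson(1.0, size=(200, 2000)).astype(theano.config.floatX)
learning_rate, batch_size = 1e-3, 100
for epoch in range(10):
    beta = min(1.0, (epoch + 1) / 5.0)  # assumed linear KL warm-up schedule
    for i in range(0, len(x_train), batch_size):
        model.f_train(x_train[i:i + batch_size], learning_rate, beta)
    lower_bound, reconstruction_error, kl, kl_all = model.f_eval(x_valid)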
def loss(test=False): return lasagne.objectives.categorical_crossentropy( get_output(network, deterministic=test), target_var).mean()
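# Sketch: the closure above yields both the training and the evaluation
# objective; compiling them might look like this (input_var and the
# optimiser choice are assumptions).
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss(test=False), params)
train_fn = theano.function([input_var, target_var], loss(test=False),
                           updates=updates)
valid_fn = theano.function([input_var, target_var], loss(test=True))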
def train_model(learning_rate=0.0009, n_epochs=50, batch_size=200): ''' Function that compute the training of the model ''' ####################### # Loading the dataset # ####################### print ('... Loading data') # Load the dataset on the CPU data_path = get_path() train_input_path = 'train_input_' train_target_path = 'train_target_' valid_input_path = 'valid_input_' valid_target_path = 'valid_target_' nb_train_batch = 9 nb_valid_batch = 5 # Creating symbolic variables batch = 200 max_size = 25 min_train_size = 13 min_valid_size = 2 input_channel = 3 max_height = 64 max_width = 64 min_height = 32 min_width = 32 # Shape = (5000, 3, 64, 64) big_train_input = shared_GPU_data(shape=(batch * max_size, input_channel, max_height, max_width)) big_valid_input = shared_GPU_data(shape=(batch * max_size, input_channel, max_height, max_width)) # Shape = (5000, 3, 32, 32) big_train_target = shared_GPU_data(shape=(batch * max_size, input_channel, min_height, min_width)) big_valid_target = shared_GPU_data(shape=(batch * max_size, input_channel, min_height, min_width)) # Shape = (2600, 3, 64, 64) small_train_input = shared_GPU_data(shape=(batch * min_train_size, input_channel, max_height, max_width)) # Shape = (2600, 3, 32, 32) small_train_target = shared_GPU_data(shape=(batch * min_train_size, input_channel, min_height, min_width)) # Shape = (400, 3, 64, 64) small_valid_input = shared_GPU_data(shape=(batch * min_valid_size, input_channel, max_height, max_width)) # Shape = (400, 3, 32, 32) small_valid_target = shared_GPU_data(shape=(batch * min_valid_size, input_channel, min_height, min_width)) ###################### # Building the model # ###################### # Symbolic variables x = T.tensor4('x', dtype=theano.config.floatX) y = T.tensor4('y', dtype=theano.config.floatX) index = T.lscalar() # Creation of the model model = build_model2(input_var=x) output = layers.get_output(model, deterministic=True) params = layers.get_all_params(model, trainable=True) loss = T.mean(objectives.squared_error(output, y)) updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate) # Creation of theano functions train_big_model = theano.function([index], loss, updates=updates, allow_input_downcast=True, givens={x: big_train_input[index * batch_size: (index + 1) * batch_size], y: big_train_target[index * batch_size: (index + 1) * batch_size]}) train_small_model = theano.function([index], loss, updates=updates, allow_input_downcast=True, givens={x: small_train_input[index * batch_size: (index + 1) * batch_size], y: small_train_target[index * batch_size: (index + 1) * batch_size]}) big_valid_loss = theano.function([index], loss, allow_input_downcast=True, givens={x: big_valid_input[index * batch_size: (index + 1) * batch_size], y: big_valid_target[index * batch_size: (index + 1) * batch_size]}) small_valid_loss = theano.function([index], loss, allow_input_downcast=True, givens={x: small_valid_input[index * batch_size: (index + 1) * batch_size], y: small_valid_target[index * batch_size: (index + 1) * batch_size]}) idx = 50 # idx = index in this case pred_batch = 5 predict_target = theano.function([index], output, allow_input_downcast=True, givens={x: small_valid_input[index * pred_batch: (index + 1) * pred_batch]}) ################### # Train the model # ################### print('... 
Training') best_validation_loss = np.inf best_iter = 0 epoch = 0 # Valid images chosen when a better model is found batch_verification = 0 num_images = range(idx * pred_batch, (idx + 1) * pred_batch) start_time = timeit.default_timer() while (epoch < n_epochs): epoch = epoch + 1 n_train_batches = 0 for i in range(nb_train_batch): if i == (nb_train_batch - 1): # Shape = (2600, 3, 64, 64) & Shape = (2600, 3, 32, 32) input, target = get_image(data_path, train_input_path, train_target_path, str(i)) small_train_input.set_value(input) small_train_target.set_value(target) for j in range(min_train_size): cost = train_small_model(j) n_train_batches += 1 else: # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32) input, target = get_image(data_path, train_input_path, train_target_path, str(i)) big_train_input.set_value(input[0: batch * max_size]) big_train_target.set_value(target[0: batch * max_size]) for j in range(max_size): cost = train_big_model(j) n_train_batches += 1 big_train_input.set_value(input[batch * max_size:]) big_train_target.set_value(target[batch * max_size:]) for j in range(max_size): cost = train_big_model(j) n_train_batches += 1 validation_losses = [] for i in range(nb_valid_batch): if i == (nb_valid_batch - 1): # Shape = (400, 3, 64, 64) & Shape = (400, 3, 32, 32) input, target = get_image(data_path, valid_input_path, valid_target_path, str(i)) small_valid_input.set_value(input) small_valid_target.set_value(target) for j in range(min_valid_size): validation_losses.append(small_valid_loss(j)) else: # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32) input, target = get_image(data_path, valid_input_path, valid_target_path, str(i)) big_valid_input.set_value(input[0: batch * max_size]) big_valid_target.set_value(target[0: batch * max_size]) for j in range(max_size): validation_losses.append(big_valid_loss(j)) big_valid_input.set_value(input[batch * max_size:]) big_valid_target.set_value(target[batch * max_size:]) for j in range(max_size): validation_losses.append(big_valid_loss(j)) this_validation_loss = np.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, n_train_batches, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = epoch # save the model and a bunch of valid pictures print ('... saving model and valid images') np.savez('best_cnn_model.npz', *layers.get_all_param_values(model)) # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32) input, target = get_image(data_path, valid_input_path, valid_target_path, str(batch_verification)) small_valid_input.set_value(input[0: batch * min_valid_size]) input = input[num_images] target = target[num_images] output = predict_target(idx) save_images(input=input, target=target, output=output, nbr_images=len(num_images), iteration=epoch) end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at epoch %i' % (best_validation_loss * 100., best_iter)) print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
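# For reference, a shared_GPU_data helper consistent with how it is used
# above (a sketch; the project's real helper may differ): allocate the
# buffer once on the device and swap batches in with .set_value(), so the
# givens-based train/valid functions never transfer data per call.
def shared_GPU_data(shape, dtype=theano.config.floatX):
    return theano.shared(np.zeros(shape, dtype=dtype), borrow=True)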
def run_task(vv, log_dir=None, exp_name=None): global policy global baseline trpo_stepsize = 0.01 trpo_subsample_factor = 0.2 # Check if variant is available if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: raise ValueError('Unrecognized model type for simulating robot') if vv['robot_type'] not in ['MRZR', 'RCCar']: raise ValueError('Unrecognized robot type') # Load environment if not vv['use_ros']: env = StraightEnv(target_velocity=vv['target_velocity'], dt=vv['dt'], model_type=vv['model_type'], robot_type=vv['robot_type'], mu_s=vv['mu_s'], mu_k=vv['mu_k']) else: from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS env = StraightEnvROS(target_velocity=vv['target_velocity'], dt=vv['dt'], model_type=vv['model_type'], robot_type=vv['robot_type']) # Save variant information for comparison plots variant_file = logger.get_snapshot_dir() + '/variant.json' logger.log_variant(variant_file, vv) # Set variance for each action component separately for exploration # Note: We set the variance manually because we are not scaling our # action space during training. init_std_speed = vv['target_velocity'] / 4 init_std_steer = np.pi / 6 init_std = [init_std_speed, init_std_steer] # Build policy and baseline networks # Note: Mean of policy network set to analytically computed values for # faster training (rough estimates for RL to fine-tune). if policy is None or baseline is None: target_velocity = vv['target_velocity'] target_steering = 0 output_mean = np.array([target_velocity, target_steering]) hidden_sizes = (32, 32) # In mean network, allow output b values to dominate final output # value by constraining the magnitude of the output W matrix. This is # to allow faster learning. These numbers are arbitrarily chosen. W_gain = min(vv['target_velocity'] / 5, np.pi / 15) mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim, ), output_dim=env.spec.action_space.flat_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=LN.tanh, output_nonlinearity=None, output_W_init=LI.GlorotUniform(gain=W_gain), output_b_init=output_mean) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32), init_std=init_std, mean_network=mean_network) baseline = LinearFeatureBaseline(env_spec=env.spec, target_key='returns') # Reset variance to re-enable exploration when using pre-trained networks else: policy._l_log_std = ParamLayer( policy._mean_network.input_layer, num_units=env.spec.action_space.flat_dim, param=LI.Constant(np.log(init_std)), name='output_log_std', trainable=True) obs_var = policy._mean_network.input_layer.input_var mean_var, log_std_var = L.get_output( [policy._l_mean, policy._l_log_std]) policy._log_std_var = log_std_var LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) policy._f_dist = ext.compile_function(inputs=[obs_var], outputs=[mean_var, log_std_var]) safety_baseline = LinearFeatureBaseline(env_spec=env.spec, target_key='safety_returns') safety_constraint = StraightSafetyConstraint(max_value=1.0, baseline=safety_baseline) if vv['algo'] == 'TRPO': algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=600, max_path_length=env.horizon, n_itr=600, discount=0.99, step_size=trpo_stepsize, plot=False, ) else: algo = CPO(env=env, policy=policy, baseline=baseline, safety_constraint=safety_constraint, batch_size=600, max_path_length=env.horizon, n_itr=600, discount=0.99, step_size=trpo_stepsize, gae_lambda=0.95, safety_gae_lambda=1, optimizer_args={'subsample_factor': trpo_subsample_factor}, plot=False) algo.train()
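# Illustrative invocation of run_task; every value in the variant dict
# below is an assumption for the sketch, and a configured rllab logger is
# presumed to be available.
vv = dict(model_type='BrushTireModel', robot_type='RCCar', use_ros=False,
          target_velocity=1.0, dt=0.1, mu_s=1.37, mu_k=1.96, algo='TRPO')
run_task(vv)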
def __init__(self, stories, QAs, batch_size, story_v, learning_rate, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, story_source, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): #print "==> not used params in DMN class:", kwargs.keys() self.learning_rate = learning_rate self.rng = np.random self.rng.seed(1234) mqa = MovieQA.DataLoader() ### Load Word2Vec model w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') self.w2v = w2v_model self.d_w2v = len(w2v_model.get_vector(w2v_model.vocab[1])) self.word_thresh = 1 print "Loaded word2vec model: dim = %d | vocab-size = %d" % ( self.d_w2v, len(w2v_model.vocab)) ### Create vocabulary-to-index and index-to-vocabulary v2i = {'': 0, 'UNK': 1} # vocabulary to index QA_words, v2i = self.create_vocabulary( QAs, stories, v2i, w2v_vocab=w2v_model.vocab.tolist(), word_thresh=self.word_thresh) i2v = {v: k for k, v in v2i.iteritems()} self.vocab = v2i self.ivocab = i2v self.story_v = story_v self.word2vec = w2v_model self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.batch_size = batch_size self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in #self.max_inp_sent_len = 0 #self.max_q_len = 0 ### Convert QAs and stories into numpy matrices (like in the bAbI data set) # storyM - Dictionary - indexed by imdb_key. Values are [num-sentence X max-num-words] # questionM - NP array - [num-question X max-num-words] # answerM - NP array - [num-question X num-answer-options X max-num-words] storyM, questionM, answerM = self.data_in_matrix_form( stories, QA_words, v2i) qinfo = self.associate_additional_QA_info(QAs) ### Split everything into train, val, and test data train_storyM = { k: v for k, v in storyM.iteritems() if k in mqa.data_split['train'] } val_storyM = { k: v for k, v in storyM.iteritems() if k in mqa.data_split['val'] } test_storyM = { k: v for k, v in storyM.iteritems() if k in mqa.data_split['test'] } def split_train_test(long_list, QAs, trnkey='train', tstkey='val'): # Create train/val/test splits based on key train_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('train') ] val_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('val') ] test_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('test') ] if type(long_list) == np.ndarray: return np.array(train_split), np.array(val_split), np.array( test_split) else: return train_split, val_split, test_split train_questionM, val_questionM, test_questionM = split_train_test( questionM, QAs) train_answerM, val_answerM, test_answerM, = split_train_test( answerM, QAs) train_qinfo, val_qinfo, test_qinfo = split_train_test(qinfo, QAs) QA_train = [qa for qa in QAs if qa.qid.startswith('train:')] QA_val = [qa for qa in QAs if qa.qid.startswith('val:')] QA_test = [qa for qa in QAs if qa.qid.startswith('test:')] #train_data = {'s':train_storyM, 'q':train_questionM, 'a':train_answerM, 'qinfo':train_qinfo} #val_data = {'s':val_storyM, 'q':val_questionM, 'a':val_answerM, 'qinfo':val_qinfo} #test_data = {'s':test_storyM, 'q':test_questionM, 'a':test_answerM, 'qinfo':test_qinfo} with open('train_split.json') as fid: trdev = json.load(fid) s_key = self.story_v.keys() self.train_range = [ k for k, qi in enumerate(qinfo) if 
(qi['movie'] in trdev['train'] and qi['qid'] in s_key) ] self.train_val_range = [ k for k, qi in enumerate(qinfo) if (qi['movie'] in trdev['dev'] and qi['qid'] in s_key) ] self.val_range = [ k for k, qi in enumerate(val_qinfo) if qi['qid'] in s_key ] self.max_sent_len = max( [sty.shape[0] for sty in self.story_v.values()]) self.train_input = self.story_v self.train_val_input = self.story_v self.test_input = self.story_v self.train_q = train_questionM self.train_answer = train_answerM self.train_qinfo = train_qinfo self.train_val_q = train_questionM self.train_val_answer = train_answerM self.train_val_qinfo = train_qinfo self.test_q = val_questionM self.test_answer = val_answerM self.test_qinfo = val_qinfo """Setup some configuration parts of the model. """ self.v2i = v2i self.vs = len(v2i) self.d_lproj = 300 # define Look-Up-Table mask np_mask = np.vstack( (np.zeros(self.d_w2v), np.ones((self.vs - 1, self.d_w2v)))) T_mask = theano.shared(np_mask.astype(theano.config.floatX), name='LUT_mask') # setup Look-Up-Table to be Word2Vec self.pca_mat = None print "Initialize LUTs as word2vec and use linear projection layer" self.LUT = np.zeros((self.vs, self.d_w2v), dtype='float32') found_words = 0 for w, v in self.v2i.iteritems(): if w in self.w2v.vocab: # all valid words are already in vocab or 'UNK' self.LUT[v] = self.w2v.get_vector(w) found_words += 1 else: # LUT[v] = np.zeros((self.d_w2v)) self.LUT[v] = self.rng.randn(self.d_w2v) self.LUT[v] = self.LUT[v] / (np.linalg.norm(self.LUT[v]) + 1e-6) print "Found %d / %d words" % (found_words, len(self.v2i)) # word 0 is blanked out, word 1 is 'UNK' self.LUT[0] = np.zeros((self.d_w2v)) # if linear projection layer is not the same shape as LUT, then initialize with PCA if self.d_lproj != self.LUT.shape[1]: pca = PCA(n_components=self.d_lproj, whiten=True) self.pca_mat = pca.fit_transform(self.LUT.T) # 300 x 100? # setup LUT! 
self.T_w2v = theano.shared(self.LUT.astype(theano.config.floatX)) self.train_input_mask = np_mask self.test_input_mask = np_mask #self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw) #self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.tensor3('answer_var') self.input_mask_var = T.imatrix('input_mask_var') self.target = T.ivector('target') self.attentions = [] #self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len) #self.pe_matrix_q = self.pe_matrix(self.max_q_len) print "==> building input module" #positional encoder weights self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) #biGRU input fusion weights self.W_inp_res_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_res_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.inp_sent_reps = self.input_var #self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps) #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked) self.ans_reps = self.answer_var self.inp_c = self.input_module_full(self.inp_sent_reps) self.q_q = self.q_var print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 
self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, 1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, 1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): self.mem_weight_num = int(iter - 1) current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in[self.mem_weight_num], self.W_mem_res_hid[self.mem_weight_num], self.b_mem_res[self.mem_weight_num], self.W_mem_upd_in[self.mem_weight_num], self.W_mem_upd_hid[self.mem_weight_num], self.b_mem_upd[self.mem_weight_num], self.W_mem_hid_in[self.mem_weight_num], self.W_mem_hid_hid[self.mem_weight_num], self.b_mem_hid[self.mem_weight_num])) last_mem_raw = memory[-1] net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(300, self.dim)) if self.answer_module == 'feedforward': self.temp = T.dot(self.ans_reps, self.W_a) self.prediction = nn_utils.softmax(T.dot(self.temp, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # add conditional ending? 
dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_pe, self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd, self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd, self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd, self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd, self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd, self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.target) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = T.mean(self.loss_ce) + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, learning_rate=self.learning_rate, beta1=0.5) #from DCGAN paper #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) self.attentions = T.stack(self.attentions) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.target ], outputs=[self.prediction, self.loss, self.attentions], updates=updates, on_unused_input='warn', allow_input_downcast=True) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[self.input_var, self.q_var, self.answer_var, self.target], outputs=[self.prediction, self.loss, self.attentions], on_unused_input='warn', allow_input_downcast=True)
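# For reference, one common form of the GRU step that GRU_update is assumed
# to implement with the weight/bias triplets defined above (a sketch for a
# single example x; W_*_in is (dim, input_dim), W_*_hid is (dim, dim)).
def gru_step(h_prev, x,
             W_res_in, W_res_hid, b_res,
             W_upd_in, W_upd_hid, b_upd,
             W_hid_in, W_hid_hid, b_hid):
    r = T.nnet.sigmoid(T.dot(W_res_in, x) + T.dot(W_res_hid, h_prev) + b_res)
    u = T.nnet.sigmoid(T.dot(W_upd_in, x) + T.dot(W_upd_hid, h_prev) + b_upd)
    h_tilde = T.tanh(T.dot(W_hid_in, x) + r * T.dot(W_hid_hid, h_prev) + b_hid)
    return u * h_prev + (1.0 - u) * h_tilde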
nonlinearity=nn.relu), g=None)) # 4 -> 8 gen_layers.append( nn.batch_norm(nn.Deconv2DLayer(gen_layers[-1], (args.batch_size, 128, 16, 16), (5, 5), W=Normal(0.05), nonlinearity=nn.relu), g=None)) # 8 -> 16 gen_layers.append( nn.weight_norm(nn.Deconv2DLayer(gen_layers[-1], (args.batch_size, 3, 32, 32), (5, 5), W=Normal(0.05), nonlinearity=T.tanh), train_g=True, init_stdv=0.1)) # 16 -> 32 gen_dat = ll.get_output(gen_layers[-1]) # specify discriminative model disc_layers = [ll.InputLayer(shape=(None, 3, 32, 32))] disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.2)) disc_layers.append( nn.weight_norm( dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3, 3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append( nn.weight_norm( dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3, 3),
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        hidden_sizes,
        conv_filters, conv_filter_sizes, conv_strides, conv_pads,
        hidden_nonlinearity=NL.rectify,
        mean_network=None,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        subsample_factor=1.0,
        batchsize=None,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_conv_filters=[], std_conv_filter_sizes=[], std_conv_strides=[], std_conv_pads=[],
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
):
    """
    :param input_shape: Shape of the input data, usually for images of the
        form (width, height, channel).
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration.
    :param learn_std: Whether to learn the standard deviations. Only
        effective if adaptive_std is False. If adaptive_std is True, this
        parameter is ignored, and the weights for the std network are always
        learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the std
        network. Only used if `std_share_network` is False. It defaults to
        the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std
        network. Only used if `std_share_network` is False. It defaults to
        the same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer("optimizer")
        else:
            optimizer = LbfgsOptimizer("optimizer")

    self._optimizer = optimizer
    self.input_shape = input_shape
    if mean_network is None:
        mean_network = ConvNetwork(
            name="mean_network",
            input_shape=input_shape,
            output_dim=output_dim,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=None,
        )

    l_mean = mean_network.output_layer

    if adaptive_std:
        l_log_std = ConvNetwork(
            name="log_std_network",
            input_shape=input_shape,
            input_var=mean_network.input_layer.input_var,
            output_dim=output_dim,
            conv_filters=std_conv_filters,
            conv_filter_sizes=std_conv_filter_sizes,
            conv_strides=std_conv_strides,
            conv_pads=std_conv_pads,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_nonlinearity,
            output_nonlinearity=None,
        ).output_layer
    else:
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=output_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    LasagnePowered.__init__(self, [l_mean, l_log_std])

    xs_var = mean_network.input_layer.input_var
    ys_var = TT.matrix("ys")
    old_means_var = TT.matrix("old_means")
    old_log_stds_var = TT.matrix("old_log_stds")

    x_mean_var = theano.shared(
        np.zeros((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_mean",
        broadcastable=(True, False),
    )
    x_std_var = theano.shared(
        np.ones((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_std",
        broadcastable=(True, False),
    )
    y_mean_var = theano.shared(
        np.zeros((1, output_dim), dtype=theano.config.floatX),
        name="y_mean",
        broadcastable=(True, False)
    )
    y_std_var = theano.shared(
        np.ones((1, output_dim), dtype=theano.config.floatX),
        name="y_std",
        broadcastable=(True, False)
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var
    normalized_ys_var = (ys_var - y_mean_var) / y_std_var

    normalized_means_var = L.get_output(
        l_mean, {mean_network.input_layer: normalized_xs_var})
    normalized_log_stds_var = L.get_output(
        l_log_std, {mean_network.input_layer: normalized_xs_var})

    means_var = normalized_means_var * y_std_var + y_mean_var
    log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

    normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
    normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

    dist = self._dist = DiagonalGaussian(output_dim)

    normalized_dist_info_vars = dict(
        mean=normalized_means_var, log_std=normalized_log_stds_var)

    mean_kl = TT.mean(dist.kl_sym(
        dict(mean=normalized_old_means_var,
             log_std=normalized_old_log_stds_var),
        normalized_dist_info_vars,
    ))

    loss = -TT.mean(dist.log_likelihood_sym(
        normalized_ys_var, normalized_dist_info_vars))

    self._f_predict = compile_function([xs_var], means_var)
    self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
    self._l_mean = l_mean
    self._l_log_std = l_log_std

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[normalized_means_var, normalized_log_stds_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [
            xs_var, ys_var, old_means_var, old_log_stds_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._mean_network = mean_network
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
    self._y_mean_var = y_mean_var
    self._y_std_var = y_std_var
    self._subsample_factor = subsample_factor
    self._batchsize = batchsize
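# Sketch of the output de-normalisation used above: the network predicts in
# normalised target space, and the Gaussian parameters map back via
# mean = mean_norm * y_std + y_mean and log_std = log_std_norm + log(y_std).
def denormalize_gaussian(mean_norm, log_std_norm, y_mean, y_std):
    return mean_norm * y_std + y_mean, log_std_norm + np.log(y_std)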