def add_shallow_conv_maxpool(network):
    regularization = 0
    filter_size = (3, 3, 3)

    network = lasagne.layers.dnn.Conv3DDNNLayer(
        incoming=network, pad='same', num_filters=32, filter_size=filter_size,
        nonlinearity=lasagne.nonlinearities.leaky_rectify)
    l2_penalty = regularize_layer_params_weighted({network: 0.2}, l2)
    regularization += l2_penalty
    network = lasagne.layers.dnn.MaxPool3DDNNLayer(incoming=network, pool_size=(2, 2, 2), stride=2)

    network = lasagne.layers.dnn.Conv3DDNNLayer(
        incoming=network, pad='same', num_filters=64, filter_size=filter_size,
        nonlinearity=lasagne.nonlinearities.leaky_rectify)
    l2_penalty = regularize_layer_params_weighted({network: 0.2}, l2)
    regularization += l2_penalty
    network = lasagne.layers.dnn.MaxPool3DDNNLayer(incoming=network, pool_size=(2, 2, 2), stride=2)

    return network, regularization
def build_layer(self, model, all_l1_regs, all_l2_regs):
    model = DenseLayer(model, num_units=self.n_hidden,
                       nonlinearity=utils.get_non_linearity(self.non_linearity))
    if self.l1_reg != 0:
        all_l1_regs += regularize_layer_params_weighted({model: self.l1_reg}, l1)
    if self.l2_reg != 0:
        all_l2_regs += regularize_layer_params_weighted({model: self.l2_reg}, l2)
    if self.batch_norm == "Y":
        model = batch_norm(model)
    if self.dropout_p != 0:
        model = DropoutLayer(model, p=self.dropout_p)
    return model, all_l1_regs, all_l2_regs
def build_cnn(input_var=None, batch_size=None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(None, 32, 32, 32), input_var=input_var) repeatInput = Repeat(network, 40) network = lasagne.layers.ReshapeLayer(repeatInput, (-1, 1, 32, 32, 32)) network_transformed = AffineTransformation3DLayer(network, batch_size * 40) network = lasagne.layers.ReshapeLayer(network_transformed, (-1, 32, 32, 32)) network = Conv2DLayer( network, num_filters=5, filter_size=(1, 1), # nonlinearity=lasagne.nonlinearities.sigmoid, nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.GlorotUniform()) network = lasagne.layers.BatchNormLayer(network) network = lasagne.layers.NonlinearityLayer(network, nonlinearity=lasagne.nonlinearities.rectify) network = Conv2DLayer( network, num_filters=5, filter_size=(1, 1), nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.GlorotUniform()) network = lasagne.layers.NonlinearityLayer(network, nonlinearity=lasagne.nonlinearities.rectify) network = lasagne.layers.BatchNormLayer(network) network = Conv2DLayer( network, num_filters=1, filter_size=(1, 1), nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.GlorotUniform()) network = lasagne.layers.BatchNormLayer(network) network = lasagne.layers.NonlinearityLayer(network, nonlinearity=lasagne.nonlinearities.rectify) network = NIN_block(network, 5, (128, 96, 96)) network = MaxPool2DLayer(network, pool_size=(2, 2), stride=(2, 2)) network = lasagne.layers.dropout(network, 0.5) network = NIN_block(network, 5, (128, 96, 96)) network = MaxPool2DLayer(network, pool_size=(2, 2), stride=(2, 2)) network = lasagne.layers.dropout(network, 0.5) network = NIN_block(network, 3, (128, 128, 40)) network = MaxPool2DLayer(network, pool_size=(8, 8), stride=(1, 1)) fc2 = lasagne.layers.DenseLayer( network, num_units=40, nonlinearity=lasagne.nonlinearities.identity) fc2_selected = SelectLayer(fc2, 40) weight_decay_layers = {network: 0.0, fc2: 0.002} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return fc2, fc2_selected, l2_penalty, network_transformed
def build_cnn(input_var=None, batch_size=None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(None, 1, 40, 40, 40), input_var=input_var) network = lasagne.layers.BatchNormLayer(network) repeatInput = Repeat(network, 10) network = lasagne.layers.ReshapeLayer(repeatInput, (-1, 1, 40, 40, 40)) network_transformed = AffineTransformation3DLayer(network, batch_size * 10) network_transformed_average = lasagne.layers.ExpressionLayer( network_transformed, lambda X: X.max(-1), output_shape='auto') network = Conv2DLayer( network_transformed_average, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform()) # Max-pooling layer of factor 2 in both dimensions: network = MaxPool2DLayer(network, pool_size=(2, 2)) # Another convolution with 32 5x5 kernels, and another 2x2 pooling: network = Conv2DLayer( network, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform() # nonlinearity=lasagne.nonlinearities.sigmoid ) network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2)) # A fully-connected layer of 256 units with 50% dropout on its inputs: fc1 = lasagne.layers.DenseLayer( lasagne.layers.dropout(network, p=.5), # network, num_units=256, # nonlinearity=lasagne.nonlinearities.sigmoid nonlinearity=lasagne.nonlinearities.rectify, ) # And, finally, the 10-unit output layer with 50% dropout on its inputs: fc2 = lasagne.layers.DenseLayer( lasagne.layers.dropout(fc1, p=.5), nonlinearity=lasagne.nonlinearities.identity, num_units=10, ) network_transformed = lasagne.layers.ReshapeLayer( network_transformed_average, (-1, 10, 40, 40)) fc2_selected = SelectLayer(fc2, 10) weight_decay_layers = {fc1: 0.0, fc2: 0.002} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return fc2, fc2_selected, l2_penalty, network_transformed
def add_dense_layers(network, n_layers, n_units):
    regularization = 0
    for i in range(0, n_layers):
        network = lasagne.layers.DenseLayer(
            incoming=network, num_units=n_units,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        l2_penalty = regularize_layer_params_weighted({network: 0.2}, l2)
        regularization += l2_penalty
    return network, regularization
def build_network_single_lstm(args, input1_var, input1_mask_var, input2_var,
                              input2_mask_var, wordEmbeddings, maxlen=36,
                              reg=0.5 * 1e-4):
    print("Building model with single lstm")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    GRAD_CLIP = wordDim

    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    input_1_mask = InputLayer((None, maxlen), input_var=input1_mask_var)
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim,
                           W=wordEmbeddings.T)
    emb_1.params[emb_1.W].remove("trainable")
    lstm_1 = LSTMLayer(emb_1, num_units=args.lstmDim, mask_input=input_1_mask,
                       grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    slice_1 = SliceLayer(lstm_1, indices=-1, axis=1)  # out_shape (None, args.lstmDim)

    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    input_2_mask = InputLayer((None, maxlen), input_var=input2_mask_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim,
                           W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")
    lstm_2 = LSTMLayer(emb_2, num_units=args.lstmDim, mask_input=input_2_mask,
                       grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    slice_2 = SliceLayer(lstm_2, indices=-1, axis=1)

    mul = ElemwiseMergeLayer([slice_1, slice_2], merge_function=T.mul)
    sub = AbsSubLayer([slice_1, slice_2], merge_function=T.sub)
    concat = ConcatLayer([mul, sub])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=logsoftmax)
    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=logsoftmax)

    layers = {lstm_1: reg, hid: reg, network: reg}
    penalty = regularize_layer_params_weighted(layers, l2)

    input_dict = {
        input_1: input1_var,
        input_2: input2_var,
        input_1_mask: input1_mask_var,
        input_2_mask: input2_mask_var,
    }

    return network, penalty, input_dict
def build_cnn(input_var=None):
    # Input layer, as usual:
    network = lasagne.layers.InputLayer(shape=(None, 1, 40, 40), input_var=input_var)
    # This time we do not apply input dropout, as it tends to work less well
    # for convolutional layers.

    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
    # convolutions are supported as well; see the docstring.
    network = Conv2DLayer(
        network, num_filters=32, filter_size=(5, 5),
        # nonlinearity=lasagne.nonlinearities.sigmoid,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform())
    # Expert note: Lasagne provides alternative convolutional layers that
    # override Theano's choice of which implementation to use; for details
    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.

    # Max-pooling layer of factor 2 in both dimensions:
    network = MaxPool2DLayer(network, pool_size=(2, 2))

    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
    network = Conv2DLayer(
        network, num_filters=32, filter_size=(5, 5),
        nonlinearity=lasagne.nonlinearities.rectify,
        # nonlinearity=lasagne.nonlinearities.sigmoid,
        W=lasagne.init.GlorotUniform())
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    fc1 = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=.5),
        num_units=256,
        # nonlinearity=lasagne.nonlinearities.sigmoid,
        nonlinearity=lasagne.nonlinearities.rectify)

    # And, finally, the 10-unit output layer with 50% dropout on its inputs:
    fc2 = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(fc1, p=.5),
        num_units=10,
        # nonlinearity=lasagne.nonlinearities.softmax,
        nonlinearity=lasagne.nonlinearities.sigmoid)

    weight_decay_layers = {fc1: 0.0, fc2: 0.002}
    l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2)

    return fc2, l2_penalty
def test_regularize_layer_params_weighted(self, layers):
    from lasagne.regularization import regularize_layer_params_weighted
    from lasagne.regularization import apply_penalty, l2
    l_1, l_2, l_3 = layers

    layers = OrderedDict()
    layers[l_2] = 0.1
    layers[l_3] = 0.5

    loss = regularize_layer_params_weighted(layers, lasagne.regularization.l2)
    assert equal_computations([loss],
                              [sum([0.1 * apply_penalty([l_2.W], l2),
                                    0.5 * apply_penalty([l_3.W], l2)])])
def initNetwork(X, Y, config):
    alexNetModel = alexNet2(config, X)
    # network = lasagne.layers.FlattenLayer(alexNetModel.outLayer)
    network = lasagne.layers.DropoutLayer(alexNetModel.outLayer,
                                          p=config['prob_drop'], rescale=False)  # dropout
    wtFileName = config['weightsDir'] + 'W_5.npy'
    bFileName = config['weightsDir'] + 'b_5.npy'
    # if classifier weights are not present, init with random weights
    network = lasagne.layers.DenseLayer(network, num_units=31,
                                        W=getClassifierParam(wtFileName, False),
                                        b=getClassifierParam(bFileName, True),
                                        nonlinearity=lasagne.nonlinearities.softmax)

    # read off a line like ":regularize: 0.1,0.1,0.1,0.1,0.1,0.1" from the config.yaml file
    regMult = [float(i) for i in config['regularize'].split(',')]
    layersRegMultiplier = {alexNetModel.layers[layerId]: regMult[layerId]
                           for layerId in range(len(alexNetModel.layers))}
    layersRegMultiplier[network] = regMult[-1]
    l2_penalty = regularize_layer_params_weighted(layersRegMultiplier, l2)

    prediction = lasagne.layers.get_output(network, deterministic=True)
    lossAll = lasagne.objectives.categorical_crossentropy(prediction, Y)  # loss function
    loss = lossAll.mean()
    loss = loss + l2_penalty
    accuracy = T.mean(T.eq(T.argmax(prediction, axis=1), Y), dtype=theano.config.floatX)
    match = T.eq(T.argmax(prediction, axis=1), Y)
    params = lasagne.layers.get_all_params(network, trainable=True)
    return [loss, params, accuracy, match]
def l2_network(input, n_outputs, last_nonlinearity):
    """
    l2_network is a shallow network with L2-norm regularization on all
    ConvLayers and DenseLayers (corresponds to a gaussian prior assumption on
    all weights).

    Usage::

        >>> import theano.tensor as T
        >>> from lasagne.layers import InputLayer
        >>> from lasagne.nonlinearities import sigmoid
        >>>
        >>> inputs = T.tensor5("inputs")
        >>> input_layer = InputLayer(input_var=inputs, shape=(None, 2, 32, 32, 32))
        >>> n_classes = 2
        >>> # apply the network
        >>> output_layer, l2_terms = l2_network(input_layer, n_classes, sigmoid)

    :param input: a lasagne layer, on top of which the network is applied
    :param n_outputs: number of output units in the last layer
    :param last_nonlinearity: what the non-linearity in the last layer should be
    :return: the last lasagne layer of the network, and L2 regularization terms
        if there are any (otherwise 0).
    """
    regularization = 0
    network = input

    # add the shallow convolutional structure
    network, penalty = add_shallow_conv_maxpool(network)
    regularization += penalty

    # add dense fully connected layers
    network, penalty = add_dense_layers(network, n_layers=1, n_units=256)
    regularization += penalty

    # add the output layer non-linearity
    network = lasagne.layers.DenseLayer(incoming=network, num_units=n_outputs,
                                        nonlinearity=last_nonlinearity)
    l2_penalty = regularize_layer_params_weighted({network: 0.2}, l2)
    regularization += l2_penalty

    return network, regularization
def __init_model(self): """Initializes the model and compiles the network For the most part, this consists of setting up some bookkeeping for theano and lasagne, and compiling the theano functions """ logging.info('initializing model') if self.Xshape == None or self.yshape == None: if self.Xshape == None: logging.warning("Tried to compile Neural Net before" "setting input dimensionality") if self.yshape == None: logging.warning("Tried to compile Neural Net before" "setting output dimensionality") raise ShapeError(self.Xshape, self.yshape) # These are theano/lasagne symbolic variable declarationss, # representing... the target vector(traces) target_vector = T.fmatrix('y') # our predictions predictions = lasagne.layers.get_output(self.layer_out) validation_predictions = lasagne.layers.get_output(self.layer_out, deterministic=True) # the loss (diff in objective) for training # using MSE stochastic_loss = lasagne.objectives.squared_error( predictions, target_vector).mean() #print(stochastic_loss) deterministic_loss = lasagne.objectives.squared_error( validation_predictions, target_vector).mean() # using cross entropy #stochastic_loss = lasagne.objectives.categorical_crossentropy(predictions, target_vector).mean() # the loss for validation #deterministic_loss = lasagne.objectives.categorical_crossentropy(test_predictions, target_vector).mean() # calculate loss loss = stochastic_loss # should regularization be used? config = self.config if config: if config.l1_regularization: logging.info("Using L1 regularization") l1_penalty = regularize_layer_params(self.layer_out, l1) * 1e-4 loss += l1_penalty if config.l2_regularization: logging.info("Using L2 regularization with weights") for sublayer in self.layer_in: logging.info("\tinput layer ({1}) weight: {0}".format( self.layer_weights[sublayer], sublayer.name)) logging.info("\toutput layer weight: {0}".format( self.layer_weights[self.layer_out])) l2_penalty = regularize_layer_params_weighted( self.layer_weights, l2) loss += l2_penalty else: logging.info("No regularization") # the network parameters (i.e. weights) all_params = lasagne.layers.get_all_params(self.layer_out) # how to update the weights updates = lasagne.updates.nesterov_momentum(loss_or_grads=loss, params=all_params, learning_rate=0.1, momentum=0.9) # The theano functions for training, validating, and tracing. # These get method-level wrappers below logging.info('compiling theano functions') self._train_fn = theano.function( on_unused_input='warn', inputs=[l.input_var for l in self.layer_in] + [target_vector], outputs=[stochastic_loss], updates=updates) self._valid_fn = theano.function( on_unused_input='warn', inputs=[l.input_var for l in self.layer_in] + [target_vector], outputs=[deterministic_loss, validation_predictions]) self._trace_fn = theano.function( on_unused_input='warn', inputs=[l.input_var for l in self.layer_in], outputs=[ validation_predictions * self.roi.shape[0] + self.roi.offset[0] ])
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val = 0.5 * 1e-4): print("Building multi task model with 1D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] kw = 2 num_filters = seqlen-kw+1 stride = 1 filter_size=wordDim pool_size=num_filters input = InputLayer((None, seqlen, num_feats),input_var=input_var) batchsize, _, _ = input.input_var.shape emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim)) conv1d_1 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size) hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid) network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax) conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size) hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid) network_2 = DenseLayer(hid_2, num_units=4, nonlinearity=softmax) conv1d_3 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size) hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid) network_3 = DenseLayer(hid_3, num_units=3, nonlinearity=softmax) conv1d_4 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size) hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid) network_4 = DenseLayer(hid_4, num_units=3, nonlinearity=softmax) conv1d_5 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size) hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid) network_5 = DenseLayer(hid_5, num_units=2, nonlinearity=softmax) conv1d_6 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size) hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, nonlinearity=sigmoid) network_6 = DenseLayer(hid_6, num_units=4, nonlinearity=softmax) conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size) hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid) network_7 = DenseLayer(hid_7, num_units=3, nonlinearity=softmax) conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size) hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid) network_8 = DenseLayer(hid_8, num_units=3, nonlinearity=softmax) # Is this important? 
network_1_out, network_2_out, network_3_out, network_4_out, \ network_5_out, network_6_out, network_7_out, network_8_out = \ get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8]) loss_1 = T.mean(binary_crossentropy(network_1_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_1:lambda_val, hid_1:lambda_val, network_1:lambda_val} , l2) updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step) train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True) val_acc_1 = T.mean(binary_accuracy(get_output(network_1, deterministic=True), target_var)) val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True) loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_2:lambda_val, hid_2:lambda_val, network_2:lambda_val} , l2) updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step) train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True) val_acc_2 = T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var)) val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True) loss_3 = T.mean(categorical_crossentropy(network_3_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_3:lambda_val, hid_3:lambda_val, network_3:lambda_val} , l2) updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step) train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True) val_acc_3 = T.mean(categorical_accuracy(get_output(network_3, deterministic=True), target_var)) val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True) loss_4 = T.mean(categorical_crossentropy(network_4_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_4:lambda_val, hid_4:lambda_val, network_4:lambda_val} , l2) updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step) train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True) val_acc_4 = T.mean(categorical_accuracy(get_output(network_4, deterministic=True), target_var)) val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True) loss_5 = T.mean(binary_crossentropy(network_5_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_5:lambda_val, hid_5:lambda_val, network_5:lambda_val} , l2) updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step) train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True) val_acc_5 = T.mean(binary_accuracy(get_output(network_5, deterministic=True), target_var)) val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True) loss_6 = T.mean(categorical_crossentropy(network_6_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_6:lambda_val, hid_6:lambda_val, network_6:lambda_val} , l2) updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step) train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True) val_acc_6 = T.mean(categorical_accuracy(get_output(network_6, 
deterministic=True), target_var)) val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True) loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_7:lambda_val, hid_7:lambda_val, network_7:lambda_val} , l2) updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step) train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True) val_acc_7 = T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var)) val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True) loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_8:lambda_val, hid_8:lambda_val, network_8:lambda_val} , l2) updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step) train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True) val_acc_8 = T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var)) val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True) return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
def build_simple_network(self): """ Builds a very, very simple non-memory network to assess the effectiveness of training on the task. - Input: (batch_size, max_seqlen, max_sentlen) - Wordwise embedding into (batch_size, max_seqlen, max_sentlen, embed_size) - Sum all words in a sentence: (batch_size, max_seqlen, embed_size) - Reshape embedding into (batch_size, max_seqlen * embed_size) - 3 hidden layers with sigmoid, hidden dim (512, 512, 256) """ batch_size, max_seqlen, max_sentlen, embedding_size, vocab = self.batch_size, self.max_seqlen, self.max_sentlen, self.embedding_size, self.vocab self.hidden_size = 256 c = T.imatrix() y = T.imatrix() self.c_shared = theano.shared(np.zeros((batch_size, max_seqlen), dtype=np.int32), borrow=True) self.a_shared = theano.shared(np.zeros((batch_size, self.num_classes), dtype=np.int32), borrow=True) S_shared = theano.shared(self.S, borrow=True) cc = S_shared[c.flatten()].reshape( (batch_size, max_seqlen, max_sentlen)) l_context_in = lasagne.layers.InputLayer(shape=(batch_size, max_seqlen, max_sentlen)) L = self.build_glove_embedding(root_dir + "/data/glove/glove.6B.100d.txt", hidden_size=embedding_size) print L embedding = lasagne.layers.EmbeddingLayer(l_context_in, len(vocab) + 1, embedding_size, W=L) sum_embeddings = ScaleSumLayer(embedding, axis=2) reshape_sum = lasagne.layers.ReshapeLayer( sum_embeddings, shape=(batch_size, max_seqlen * embedding_size)) # Fully connected layers dense_1 = lasagne.layers.DenseLayer(reshape_sum, self.hidden_size, W=lasagne.init.GlorotNormal(), nonlinearity=T.nnet.sigmoid) dense_2 = lasagne.layers.DenseLayer(dense_1, self.hidden_size, W=lasagne.init.GlorotNormal(), nonlinearity=T.nnet.sigmoid) l_pred = lasagne.layers.DenseLayer( dense_2, self.num_classes, nonlinearity=lasagne.nonlinearities.softmax) rand_in = np.random.randint(0, len(vocab) - 1, size=(batch_size, max_seqlen, max_sentlen)) fake_probs = lasagne.layers.get_output(l_pred, { l_context_in: rand_in }).eval() print "fake_probs: ", fake_probs probas = lasagne.layers.helper.get_output(l_pred, {l_context_in: cc}) pred = T.argmax(probas, axis=1) # l2 regularization reg_coeff = 1e-1 p_metric = l2 layer_dict = { dense_1: reg_coeff, dense_2: reg_coeff, l_pred: reg_coeff } reg_cost = reg_coeff * regularize_layer_params_weighted( layer_dict, p_metric) cost = T.nnet.categorical_crossentropy(probas, y).mean() #+ reg_cost params = lasagne.layers.helper.get_all_params(l_pred, trainable=True) grads = T.grad(cost, params) scaled_grads = lasagne.updates.total_norm_constraint( grads, self.max_norm) updates = lasagne.updates.adam(scaled_grads, params, learning_rate=self.lr) givens = { c: self.c_shared, y: self.a_shared, } self.train_model = theano.function([], cost, givens=givens, updates=updates, on_unused_input='ignore') self.compute_pred = theano.function([], pred, givens=givens, on_unused_input='ignore') zero_vec_tensor = T.vector() self.zero_vec = np.zeros(embedding_size, dtype=theano.config.floatX) self.set_zero = theano.function([zero_vec_tensor], on_unused_input='ignore') #self.nonlinearity = nonlinearity self.network = l_pred
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    # important context words as channels
    # CNN_sentence config
    filter_size = wordDim
    pool_size = seqlen - filter_size + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # emb.params[emb.W].remove('trainable')
    # (batchsize, seqlen, wordDim)
    # print get_output_shape(emb)

    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    # print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim,
                         stride=1, nonlinearity=tanh, W=GlorotUniform())
    # nOutputFrame = num_filters,
    # nOutputFrameSize = (num_feats*wordDim - filter_size)/stride + 1
    # print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))
    # print get_output_shape(conv1d)

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)
    # print get_output_shape(maxpool)

    # forward = FlattenLayer(maxpool)
    # print get_output_shape(forward)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))

    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer must be set correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)
    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
def build_cnn(input_var=None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var) #norm0 = BatchNormLayer(network) # conv1 conv1 = Conv2DLayer(network, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="conv1") conv1a = Conv2DLayer(conv1, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="conv1a") pool1 = MaxPool2DLayer(conv1a, pool_size=(2, 2), stride=(2, 2), pad=0) #norm1 = BatchNormLayer(pool1) # pool1 # conv2 conv2 = Conv2DLayer(lasagne.layers.dropout(pool1, p = 0.5), num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv2') conv2a = Conv2DLayer(conv2, num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv2a') pool2 = MaxPool2DLayer(conv2a, pool_size=(2, 2), stride=(2, 2), pad=0) # norm2 #norm2 = BatchNormLayer(pool2) # pool2 conv3 = Conv2DLayer(lasagne.layers.dropout(pool2, p = 0.5), num_filters=256, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv3') pool3 = MaxPool2DLayer(conv3, pool_size=(2, 2), stride=(2, 2), pad=0) #norm3 = BatchNormLayer(pool3) # fc1 fc1 = DenseLayer(lasagne.layers.dropout(pool3, p = 0.5), num_units=256, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name="fc1") # fc3 softmax_layer = DenseLayer(lasagne.layers.dropout(fc1, p = 0.5), num_units=9, nonlinearity=lasagne.nonlinearities.softmax, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="softmax") intermediate_layer = pool2 weight_decay_layers = {fc1: 0.0} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return softmax_layer, l2_penalty
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val=0.5 * 1e-4): print("Building multi task model with 1D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] kw = 2 num_filters = seqlen - kw + 1 stride = 1 filter_size = wordDim pool_size = num_filters input = InputLayer((None, seqlen, num_feats), input_var=input_var) batchsize, _, _ = input.input_var.shape #span emb1 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim)) conv1d_1 = DimshuffleLayer( Conv1DLayer(reshape1, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size) hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid) network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax) """ #DocTimeRel emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim)) conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size) hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid) network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax) """ #Type emb3 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim)) conv1d_3 = DimshuffleLayer( Conv1DLayer(reshape3, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size) hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid) network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax) #Degree emb4 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim)) conv1d_4 = DimshuffleLayer( Conv1DLayer(reshape4, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size) hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid) network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax) #Polarity emb5 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim)) conv1d_5 = DimshuffleLayer( Conv1DLayer(reshape5, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size) hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid) network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax) #ContextualModality emb6 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim)) conv1d_6 = DimshuffleLayer( Conv1DLayer(reshape6, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size) hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, 
nonlinearity=sigmoid) network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax) """ #ContextualAspect emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim)) conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size) hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid) network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax) """ """ #Permanence emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim)) conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size) hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid) network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax) """ # Is this important? """ network_1_out, network_2_out, network_3_out, network_4_out, \ network_5_out, network_6_out, network_7_out, network_8_out = \ get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8]) """ network_1_out = get_output(network_1) network_3_out = get_output(network_3) network_4_out = get_output(network_4) network_5_out = get_output(network_5) network_6_out = get_output(network_6) loss_1 = T.mean(binary_crossentropy( network_1_out, target_var)) + regularize_layer_params_weighted( { emb1: lambda_val, conv1d_1: lambda_val, hid_1: lambda_val, network_1: lambda_val }, l2) updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step) train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True) val_acc_1 = T.mean( binary_accuracy(get_output(network_1, deterministic=True), target_var)) val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True) """ loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, hid_2:lambda_val, network_2:lambda_val} , l2) updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step) train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True) val_acc_2 = T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var)) val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True) """ loss_3 = T.mean(categorical_crossentropy( network_3_out, target_var)) + regularize_layer_params_weighted( { emb3: lambda_val, conv1d_3: lambda_val, hid_3: lambda_val, network_3: lambda_val }, l2) updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step) train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True) val_acc_3 = T.mean( categorical_accuracy(get_output(network_3, deterministic=True), target_var)) val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True) loss_4 = T.mean(categorical_crossentropy( network_4_out, target_var)) + regularize_layer_params_weighted( { emb4: lambda_val, conv1d_4: lambda_val, hid_4: lambda_val, network_4: lambda_val }, 
l2) updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step) train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True) val_acc_4 = T.mean( categorical_accuracy(get_output(network_4, deterministic=True), target_var)) val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True) loss_5 = T.mean(categorical_crossentropy( network_5_out, target_var)) + regularize_layer_params_weighted( { emb5: lambda_val, conv1d_5: lambda_val, hid_5: lambda_val, network_5: lambda_val }, l2) updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step) train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True) val_acc_5 = T.mean( categorical_accuracy(get_output(network_5, deterministic=True), target_var)) val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True) loss_6 = T.mean(categorical_crossentropy( network_6_out, target_var)) + regularize_layer_params_weighted( { emb6: lambda_val, conv1d_6: lambda_val, hid_6: lambda_val, network_6: lambda_val }, l2) updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step) train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True) val_acc_6 = T.mean( categorical_accuracy(get_output(network_6, deterministic=True), target_var)) val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True) """ loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, hid_7:lambda_val, network_7:lambda_val} , l2) updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step) train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True) val_acc_7 = T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var)) val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True) loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, hid_8:lambda_val, network_8:lambda_val} , l2) updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step) train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True) val_acc_8 = T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var)) val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True) """ """ return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8 """ return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6
def main(exp_name, embed_data, train_data, train_data_stats, val_data, val_data_stats, test_data, test_data_stats, log_path, batch_size, num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty, reg_coeff): """ Main run function for training model. :param exp_name: :param embed_data: :param train_data: :param train_data_stats: :param val_data: :param val_data_stats: :param test_data: :param test_data_stats: :param log_path: :param batch_size: :param num_epochs: :param unroll_steps: :param learn_rate: :param num_dense: Number of dense fully connected layers to add after concatenation layer :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1 :param penalty: Penalty to use for regularization :param reg_weight: Regularization coeff to use for each layer of network; may want to support different coefficient for different layers :return: """ # Set random seed for deterministic results np.random.seed(0) num_ex_to_train = 30 # Load embedding table table = EmbeddingTable(embed_data) vocab_size = table.sizeVocab dim_embeddings = table.dimEmbeddings embeddings_mat = table.embeddings train_prem, train_hyp = generate_data(train_data, train_data_stats, "left", "right", table, seq_len=unroll_steps) val_prem, val_hyp = generate_data(val_data, val_data_stats, "left", "right", table, seq_len=unroll_steps) train_labels = convertLabelsToMat(train_data) val_labels = convertLabelsToMat(val_data) # To test for overfitting capabilities of model if num_ex_to_train > 0: val_prem = val_prem[0:num_ex_to_train] val_hyp = val_hyp[0:num_ex_to_train] val_labels = val_labels[0:num_ex_to_train] # Theano expressions for premise/hypothesis inputs to network x_p = T.imatrix() x_h = T.imatrix() target_values = T.fmatrix(name="target_output") # Embedding layer for premise l_in_prem = InputLayer((batch_size, unroll_steps)) l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size, output_size=dim_embeddings, W=embeddings_mat) # Embedding layer for hypothesis l_in_hyp = InputLayer((batch_size, unroll_steps)) l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size, output_size=dim_embeddings, W=embeddings_mat) # Ensure embedding matrix parameters are not trainable l_embed_hyp.params[l_embed_hyp.W].remove('trainable') l_embed_prem.params[l_embed_prem.W].remove('trainable') l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp) l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem) # Concatenate sentence embeddings for premise and hypothesis l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum]) l_in = l_concat l_output = l_concat # Add 'num_dense' dense layers with tanh # top layer is softmax if num_dense > 1: for n in range(num_dense): if n == num_dense-1: l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax) else: l_in = DenseLayer(l_in, num_units=dense_dim, nonlinearity=lasagne.nonlinearities.tanh) else: l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax) network_output = get_output(l_output, {l_in_prem: x_p, l_in_hyp: x_h}) # Will have shape (batch_size, 3) f_dense_output = theano.function([x_p, x_h], network_output, on_unused_input='warn') # Compute cost if penalty == "l2": p_metric = l2 elif penalty == "l1": p_metric = l1 layers = lasagne.layers.get_all_layers(l_output) layer_dict = {l: reg_coeff for l in layers} reg_cost = reg_coeff * regularize_layer_params_weighted(layer_dict, p_metric) cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values).mean()) + 
reg_cost compute_cost = theano.function([x_p, x_h, target_values], cost) # Compute accuracy accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1), T.argmax(target_values, axis=-1)), dtype=theano.config.floatX) compute_accuracy = theano.function([x_p, x_h, target_values], accuracy) label_output = T.argmax(network_output, axis=-1) predict = theano.function([x_p, x_h], label_output) # Define update/train functions all_params = lasagne.layers.get_all_params(l_output, trainable=True) updates = lasagne.updates.rmsprop(cost, all_params, learn_rate) train = theano.function([x_p, x_h, target_values], cost, updates=updates) # TODO: Augment embedding layer to allow for masking inputs stats = Stats(exp_name) acc_num = 10 #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size) minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size) print("Training ...") try: total_num_ex = 0 for epoch in xrange(num_epochs): for _, minibatch in minibatches: total_num_ex += len(minibatch) stats.log("Processed {0} total examples in epoch {1}".format(str(total_num_ex), str(epoch))) #prem_batch = val_prem[minibatch] #hyp_batch = val_hyp[minibatch] #labels_batch = val_labels[minibatch] prem_batch = train_prem[minibatch] hyp_batch = train_hyp[minibatch] labels_batch = train_labels[minibatch] train(prem_batch, hyp_batch, labels_batch) cost_val = compute_cost(prem_batch, hyp_batch, labels_batch) stats.recordCost(total_num_ex, cost_val) # Periodically compute and log train/dev accuracy if total_num_ex%(acc_num*batch_size) == 0: train_acc = compute_accuracy(train_prem, train_hyp, train_labels) dev_acc = compute_accuracy(val_prem, val_hyp, val_labels) stats.recordAcc(total_num_ex, train_acc, dataset="train") stats.recordAcc(total_num_ex, dev_acc, dataset="dev") except KeyboardInterrupt: pass
def build_cnn(input_var=None, batch_size = None): """Build the CIFAR-10 model. Args: images: Images returned from distorted_inputs() or inputs(). Returns: Logits. """ # We instantiate all variables using tf.get_variable() instead of # tf.Variable() in order to share variables across multiple GPU training runs. # If we only ran this model on a single GPU, we could simplify this function # by replacing all instances of tf.get_variable() with tf.Variable(). # input_layer = InputLayer((batch_size, 3, ORIGINAL_IMAGE_SIZE, ORIGINAL_IMAGE_SIZE), input_var=input_var) repeatInput = Repeat(input_layer, 10) reshapeInput = lasagne.layers.ReshapeLayer(repeatInput, (batch_size * 10, 3, ORIGINAL_IMAGE_SIZE, ORIGINAL_IMAGE_SIZE)) original_transformed = BrightnessAdjustLayer(reshapeInput, batch_size * 10) # norm0 = BatchNormLayer(original_transformed) # conv1 conv1 = Conv2DLayer(original_transformed, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="conv1") conv1a = Conv2DLayer(conv1, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="conv1a") pool1 = MaxPool2DLayer(conv1a, pool_size=(2, 2), stride=(2, 2), pad=0) # norm1 = LocalResponseNormalization2DLayer(pool1, alpha=0.001 / 9.0, # beta=0.75, k=1.0, n=9) norm1 = BatchNormLayer(pool1) # conv2 conv2 = Conv2DLayer(lasagne.layers.dropout(norm1, p = 0.5), num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv2') conv2a = Conv2DLayer(conv2, num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv2a') pool2 = MaxPool2DLayer(conv2a, pool_size=(2, 2), stride=(2, 2), pad=0) # norm2 = LocalResponseNormalization2DLayer(pool2, alpha=0.001 / 9.0, # beta=0.75, k=1.0, n=9) norm2 = BatchNormLayer(pool2) # pool2 conv3 = Conv2DLayer(lasagne.layers.dropout(norm2, p = 0.5), num_filters=256, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name='conv3') pool3 = MaxPool2DLayer(conv3, pool_size=(2, 2), stride=(2, 2), pad=0) # norm3 = LocalResponseNormalization2DLayer(pool3, alpha=0.001 / 9.0, # beta=0.75, k=1.0, n=9) norm3 = BatchNormLayer(pool3) # fc1 fc1 = DenseLayer(lasagne.layers.dropout(norm3, p = 0.5), num_units=256, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.1), name="fc1") # fc3 output_layer = DenseLayer(lasagne.layers.dropout(fc1, p = 0.5), num_units=10, #nonlinearity=lasagne.nonlinearities.softmax, nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.0), name="output") output_transformed = lasagne.layers.ReshapeLayer(original_transformed, (batch_size, 10, 3, ORIGINAL_IMAGE_SIZE, ORIGINAL_IMAGE_SIZE)) output_selected = SelectLayer(output_layer, 10) # Weight Decay weight_decay_layers = {original_transformed: 0.01} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return output_layer, output_selected, l2_penalty, output_transformed
def build_network_lstm2dconv(args, input1_var, input1_mask_var, input2_var,
                             input2_mask_var, target_var, wordEmbeddings, maxlen=36):
    print("Building model lstm + 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    GRAD_CLIP = wordDim

    num_filters = 8
    filter_size = (2, 9)
    stride = 1
    pool_size = (1, 2)

    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    input_1_mask = InputLayer((None, maxlen), input_var=input1_mask_var)
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_1.params[emb_1.W].remove("trainable")
    lstm_1 = LSTMLayer(emb_1, num_units=args.lstmDim, mask_input=input_1_mask,
                       grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_1_back = LSTMLayer(emb_1, num_units=args.lstmDim, mask_input=input_1_mask,
                            grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True)
    slice_1 = SliceLayer(lstm_1, indices=-1, axis=1)  # out_shape (None, args.lstmDim)
    slice_1_back = SliceLayer(lstm_1_back, indices=0, axis=1)  # out_shape (None, args.lstmDim)
    concat_1 = ConcatLayer([slice_1, slice_1_back], axis=1)
    reshape_1 = ReshapeLayer(concat_1, (batchsize, 1, 2, args.lstmDim))
    conv2d_1 = Conv2DLayer(reshape_1, num_filters=num_filters, filter_size=filter_size,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())  # (None, 3, 1, 48)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, 3, 1, 24)
    forward_1 = FlattenLayer(maxpool_1)  # (None, 72)

    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    input_2_mask = InputLayer((None, maxlen), input_var=input2_mask_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")
    lstm_2 = LSTMLayer(emb_2, num_units=args.lstmDim, mask_input=input_2_mask,
                       grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_2_back = LSTMLayer(emb_2, num_units=args.lstmDim, mask_input=input_2_mask,
                            grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True)
    slice_2 = SliceLayer(lstm_2, indices=-1, axis=1)
    slice_2_b = SliceLayer(lstm_2_back, indices=0, axis=1)
    concat_2 = ConcatLayer([slice_2, slice_2_b])
    reshape_2 = ReshapeLayer(concat_2, (batchsize, 1, 2, args.lstmDim))
    conv2d_2 = Conv2DLayer(reshape_2, num_filters=num_filters, filter_size=filter_size,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)
    forward_2 = FlattenLayer(maxpool_2)  # (None, 72)

    # elementwise merge needs a fixed sequence length
    mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul)
    sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub)
    concat = ConcatLayer([mul, sub])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=softmax)
    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=softmax)

    lambda_val = 0.5 * 1e-4
    layers = {lstm_1: lambda_val, conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)

    return network, penalty
def build_network_2dconv(args, input1_var, input1_mask_var, input2_var,
                         input2_mask_var, target_var, wordEmbeddings, maxlen=36):
    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # two conv-pool layer config
    # filter_size = (10, 100)
    # pool_size = (4, 4)

    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    # input_1_mask = InputLayer((None, maxlen), input_var=input1_mask_var)
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_1.params[emb_1.W].remove("trainable")  # (batchsize, maxlen, wordDim)
    reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim))
    conv2d_1 = Conv2DLayer(reshape_1, num_filters=num_filters, filter_size=filter_size,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())  # (None, 100, 34, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, 100, 1, 1)
    """
    filter_size_2 = (4, 10)
    pool_size_2 = (2, 2)
    conv2d_1 = Conv2DLayer(maxpool_1, num_filters=num_filters, filter_size=filter_size_2,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())  # (None, 100, 34, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size_2)  # (None, 100, 1, 1) (None, 100, 1, 20)
    """
    forward_1 = FlattenLayer(maxpool_1)  # (None, 100)  # (None, 50400)

    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    # input_2_mask = InputLayer((None, maxlen), input_var=input2_mask_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")
    reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim))
    conv2d_2 = Conv2DLayer(reshape_2, num_filters=num_filters, filter_size=filter_size,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())  # (None, 100, 34, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)  # (None, 100, 1, 1)
    """
    conv2d_2 = Conv2DLayer(maxpool_2, num_filters=num_filters, filter_size=filter_size_2,
                           stride=stride, nonlinearity=rectify, W=GlorotUniform())  # (None, 100, 34, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size_2)  # (None, 100, 1, 1)
    """
    forward_2 = FlattenLayer(maxpool_2)  # (None, 100)

    # elementwise merge needs a fixed sequence length
    mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul)
    sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub)
    concat = ConcatLayer([mul, sub])

    concat = ConcatLayer([forward_1, forward_2])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=softmax)
    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=softmax)

    # prediction = get_output(network, {input_1: input1_var, input_2: input2_var})
    prediction = get_output(network)

    loss = T.mean(categorical_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer must be set correctly")

    # test_prediction = get_output(network, {input_1: input1_var, input_2: input2_var}, deterministic=True)
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    """
    train_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)
    """
    train_fn = theano.function([input1_var, input2_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        """
        val_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
                                 [test_loss, test_prediction], allow_input_downcast=True)
        """
        val_fn = theano.function([input1_var, input2_var, target_var],
                                 [test_loss, test_prediction], allow_input_downcast=True)
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        """
        val_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
                                 [test_loss, test_acc], allow_input_downcast=True)
        """
        val_fn = theano.function([input1_var, input2_var, target_var],
                                 [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen): print("Building model with LSTM") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] GRAD_CLIP = wordDim args.lstmDim = 150 input = InputLayer((None, seqlen),input_var=input_var) batchsize, seqlen = input.input_var.shape input_mask = InputLayer((None, seqlen),input_var=input_mask_var) emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) #emb.params[emb_1.W].remove('trainable') lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh) lstm_back = LSTMLayer( emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True) slice_forward = SliceLayer(lstm, indices=-1, axis=1) # out_shape (None, args.lstmDim) slice_backward = SliceLayer(lstm_back, indices=0, axis=1) # out_shape (None, args.lstmDim) concat = ConcatLayer([slice_forward, slice_backward]) hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid) network = DenseLayer(hid, num_units=2, nonlinearity=softmax) prediction = get_output(network) loss = T.mean(binary_crossentropy(prediction,target_var)) lambda_val = 0.5 * 1e-4 layers = {emb:lambda_val, lstm:lambda_val, hid:lambda_val, network:lambda_val} penalty = regularize_layer_params_weighted(layers, l2) loss = loss + penalty params = get_all_params(network, trainable=True) if args.optimizer == "sgd": updates = sgd(loss, params, learning_rate=args.step) elif args.optimizer == "adagrad": updates = adagrad(loss, params, learning_rate=args.step) elif args.optimizer == "adadelta": updates = adadelta(loss, params, learning_rate=args.step) elif args.optimizer == "nesterov": updates = nesterov_momentum(loss, params, learning_rate=args.step) elif args.optimizer == "rms": updates = rmsprop(loss, params, learning_rate=args.step) elif args.optimizer == "adam": updates = adam(loss, params, learning_rate=args.step) else: raise "Need set optimizer correctly" test_prediction = get_output(network, deterministic=True) test_loss = T.mean(binary_crossentropy(test_prediction,target_var)) train_fn = theano.function([input_var, input_mask_var,target_var], loss, updates=updates, allow_input_downcast=True) test_acc = T.mean(binary_accuracy(test_prediction, target_var)) val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True) return train_fn, val_fn, network
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))

    lambda_val = 0.5 * 1e-4
    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Need to set the optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
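# --- Hedged usage sketch (not from the original source) ----------------------
# A minimal driver for the (train_fn, val_fn) pair returned by the builder
# above: shuffled minibatch training plus one full-set validation pass per
# epoch. run_training is a hypothetical helper; X_train, y_train, X_val,
# y_val, num_epochs and batch_size are placeholders supplied by the caller.
import numpy as np


def run_training(train_fn, val_fn, X_train, y_train, X_val, y_val,
                 num_epochs=10, batch_size=32):
    n = len(X_train)
    for epoch in range(num_epochs):
        order = np.random.permutation(n)
        total_loss, batches = 0.0, 0
        for start in range(0, n, batch_size):
            idx = order[start:start + batch_size]
            total_loss += train_fn(X_train[idx], y_train[idx])
            batches += 1
        val_loss, val_acc = val_fn(X_val, y_val)
        print("epoch %d: train loss %.4f, val loss %.4f, val acc %.4f"
              % (epoch + 1, total_loss / max(batches, 1), float(val_loss), float(val_acc)))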
def main(exp_name, embed_data, train_data, train_data_stats, val_data, val_data_stats, test_data, test_data_stats, log_path, batch_size, num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty, reg_coeff): """ Main run function for training model. :param exp_name: :param embed_data: :param train_data: :param train_data_stats: :param val_data: :param val_data_stats: :param test_data: :param test_data_stats: :param log_path: :param batch_size: :param num_epochs: :param unroll_steps: :param learn_rate: :param num_dense: Number of dense fully connected layers to add after concatenation layer :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1 :param penalty: Penalty to use for regularization :param reg_weight: Regularization coeff to use for each layer of network; may want to support different coefficient for different layers :return: """ # Set random seed for deterministic results np.random.seed(0) num_ex_to_train = 30 # Load embedding table table = EmbeddingTable(embed_data) vocab_size = table.sizeVocab dim_embeddings = table.dimEmbeddings embeddings_mat = table.embeddings train_prem, train_hyp = generate_data(train_data, train_data_stats, "left", "right", table, seq_len=unroll_steps) val_prem, val_hyp = generate_data(val_data, val_data_stats, "left", "right", table, seq_len=unroll_steps) train_labels = convertLabelsToMat(train_data) val_labels = convertLabelsToMat(val_data) # To test for overfitting capabilities of model if num_ex_to_train > 0: val_prem = val_prem[0:num_ex_to_train] val_hyp = val_hyp[0:num_ex_to_train] val_labels = val_labels[0:num_ex_to_train] # Theano expressions for premise/hypothesis inputs to network x_p = T.imatrix() x_h = T.imatrix() target_values = T.fmatrix(name="target_output") # Embedding layer for premise l_in_prem = InputLayer((batch_size, unroll_steps)) l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size, output_size=dim_embeddings, W=embeddings_mat) # Embedding layer for hypothesis l_in_hyp = InputLayer((batch_size, unroll_steps)) l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size, output_size=dim_embeddings, W=embeddings_mat) # Ensure embedding matrix parameters are not trainable l_embed_hyp.params[l_embed_hyp.W].remove('trainable') l_embed_prem.params[l_embed_prem.W].remove('trainable') l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp) l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem) # Concatenate sentence embeddings for premise and hypothesis l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum]) l_in = l_concat l_output = l_concat # Add 'num_dense' dense layers with tanh # top layer is softmax if num_dense > 1: for n in range(num_dense): if n == num_dense - 1: l_output = DenseLayer( l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax) else: l_in = DenseLayer(l_in, num_units=dense_dim, nonlinearity=lasagne.nonlinearities.tanh) else: l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax) network_output = get_output(l_output, { l_in_prem: x_p, l_in_hyp: x_h }) # Will have shape (batch_size, 3) f_dense_output = theano.function([x_p, x_h], network_output, on_unused_input='warn') # Compute cost if penalty == "l2": p_metric = l2 elif penalty == "l1": p_metric = l1 layers = lasagne.layers.get_all_layers(l_output) layer_dict = {l: reg_coeff for l in layers} reg_cost = reg_coeff * regularize_layer_params_weighted( layer_dict, p_metric) cost = T.mean( T.nnet.categorical_crossentropy(network_output, target_values).mean()) 
+ reg_cost compute_cost = theano.function([x_p, x_h, target_values], cost) # Compute accuracy accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1), T.argmax(target_values, axis=-1)), dtype=theano.config.floatX) compute_accuracy = theano.function([x_p, x_h, target_values], accuracy) label_output = T.argmax(network_output, axis=-1) predict = theano.function([x_p, x_h], label_output) # Define update/train functions all_params = lasagne.layers.get_all_params(l_output, trainable=True) updates = lasagne.updates.rmsprop(cost, all_params, learn_rate) train = theano.function([x_p, x_h, target_values], cost, updates=updates) # TODO: Augment embedding layer to allow for masking inputs stats = Stats(exp_name) acc_num = 10 #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size) minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size) print("Training ...") try: total_num_ex = 0 for epoch in xrange(num_epochs): for _, minibatch in minibatches: total_num_ex += len(minibatch) stats.log("Processed {0} total examples in epoch {1}".format( str(total_num_ex), str(epoch))) #prem_batch = val_prem[minibatch] #hyp_batch = val_hyp[minibatch] #labels_batch = val_labels[minibatch] prem_batch = train_prem[minibatch] hyp_batch = train_hyp[minibatch] labels_batch = train_labels[minibatch] train(prem_batch, hyp_batch, labels_batch) cost_val = compute_cost(prem_batch, hyp_batch, labels_batch) stats.recordCost(total_num_ex, cost_val) # Periodically compute and log train/dev accuracy if total_num_ex % (acc_num * batch_size) == 0: train_acc = compute_accuracy(train_prem, train_hyp, train_labels) dev_acc = compute_accuracy(val_prem, val_hyp, val_labels) stats.recordAcc(total_num_ex, train_acc, dataset="train") stats.recordAcc(total_num_ex, dev_acc, dataset="dev") except KeyboardInterrupt: pass
def build_rotation_cnn(input_var=None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(None, 1, 40, 40), input_var=input_var) # This time we do not apply input dropout, as it tends to work less well # for convolutional layers. # Convolutional layer with 32 kernels of size 5x5. Strided and padded # convolutions are supported as well; see the docstring. network = Conv2DLayer( network, num_filters=32, filter_size=(5, 5), #nonlinearity=lasagne.nonlinearities.sigmoid, nonlinearity=lasagne.nonlinearities.rectify, W = lasagne.init.Uniform(6.0/64)) #network_middle_output = lasagne.layers.ReshapeLayer(network, shape = (([0], 41472))) # Expert note: Lasagne provides alternative convolutional layers that # override Theano's choice of which implementation to use; for details # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html. # Max-pooling layer of factor 2 in both dimensions: network = MaxPool2DLayer(network, pool_size=(2, 2)) # Another convolution with 32 5x5 kernels, and another 2x2 pooling: network = Conv2DLayer( network, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, # W = all_weights[2], # b = all_weights[3], W = lasagne.init.Uniform(6.0/64) #nonlinearity=lasagne.nonlinearities.sigmoid ) network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2)) nin_layer = Conv2DLayer( network, num_filters=32, filter_size=(1, 1), nonlinearity=lasagne.nonlinearities.rectify, # W = all_weights[2], # b = all_weights[3], W = lasagne.init.HeNormal() #nonlinearity=lasagne.nonlinearities.sigmoid ) network_middle_output = lasagne.layers.ReshapeLayer(nin_layer, shape = (([0], 1568))) #network = Conv2DLayer( # network, num_filters=32, filter_size=(1, 1), # nonlinearity=lasagne.nonlinearities.rectify, # W = lasagne.init.GlorotUniform() # #nonlinearity=lasagne.nonlinearities.sigmoid # ) #network = Conv2DLayer( # network, num_filters=32, filter_size=(1, 1), # nonlinearity=lasagne.nonlinearities.rectify, # W = lasagne.init.GlorotUniform() # #nonlinearity=lasagne.nonlinearities.sigmoid # ) #network_middle_output = lasagne.layers.NonlinearityLayer(network_middle_output, nonlinearity = lasagne.nonlinearities.sigmoid) # A fully-connected layer of 256 units with 50% dropout on its inputs: network = lasagne.layers.DenseLayer( lasagne.layers.dropout(nin_layer, p=.5), #network, W = all_weights[4], b = all_weights[5], num_units=256, #nonlinearity=lasagne.nonlinearities.sigmoid nonlinearity=lasagne.nonlinearities.rectify, ) # And, finally, the 10-unit output layer with 50% dropout on its inputs: network = lasagne.layers.DenseLayer( lasagne.layers.dropout(network, p=.5), #network, W = all_weights[6], b = all_weights[7], num_units=10, nonlinearity=lasagne.nonlinearities.softmax) # Weight Decay weight_decay_layers = {nin_layer: 0.001} l1_penalty = regularize_layer_params_weighted(weight_decay_layers, l1) return network, network_middle_output, l1_penalty
def evaluate(self, embedding, train_data, validation_data, test_data, num_classes): """ Evaluates the 'embedding' using a neural network model on a training and validation dataset Parameters ---------- embedding : An embedding which implements the Embedding interface train_data ; A tuple of lists (docs, y) that constitutes the training data validation_data: A tuple of lists (docs, y) that constitutes the validation data test_data : A tuple of lists (docs, y) that constitutes the test data Returns : A float, with the top validation accuracy achieved ------- """ # The data input_docs_train = train_data[0] input_docs_val = validation_data[0] input_docs_test = test_data[0] Y_train = train_data[1] Y_val = validation_data[1] Y_test = test_data[1] # Fetch embeddings expression and represent the document as a sum of the words embeddings_var = embedding.get_embeddings_expr() doc_var = embeddings_var.sum(axis=0).dimshuffle('x',0) # Create theano symbolic variable for the target labels target_var = T.iscalar('target') # Build model using lasagne l_in = lasagne.layers.InputLayer((1, embedding.d), doc_var) l_hid = lasagne.layers.DenseLayer(l_in, num_units=120, nonlinearity=lasagne.nonlinearities.sigmoid) l_out = lasagne.layers.DenseLayer(l_hid, num_units=1, nonlinearity=lasagne.nonlinearities.sigmoid) # TODO: support multiclass # Create a loss expression for training, i.e., a scalar objective we want # to minimize prediction = lasagne.layers.get_output(l_out) prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7) # Clip to prevent zero error, which causes nan loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean() l2_penalty = regularize_layer_params_weighted({l_hid: 0.001, l_out: 0.001}, l2) loss = loss + l2_penalty # Create update expression for training params = lasagne.layers.get_all_params(l_out, trainable=True) + embedding.get_update_parameter_vars() updates = lasagne.updates.sgd(loss, params, learning_rate=0.01) # Create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.round(prediction), target_var), dtype=theano.config.floatX) # Compile a function performing a training step train_fn = theano.function([target_var] + embedding.get_variable_vars(), [loss, test_acc], updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([target_var] + embedding.get_variable_vars(), [loss, test_acc]) # Helper function for iterating the training set def iterate_data(input_docs, Y, shuffle=True): assert len(input_docs) == len(Y) if shuffle: indices = np.arange(len(input_docs)) np.random.shuffle(indices) for i in range(len(input_docs)): excerpt = indices[i] if shuffle else i yield input_docs[excerpt], Y[excerpt] ## Perform the training patience = 20 # minimum epochs patience_increase = 2 # wait this much longer when a new best is found best_validation_loss = 0 best_test_acc = 0.0 improvement_threshold = 0.999 # a relative improvement of this much is considered significant print("Starting training...") for epoch in range(self.num_epochs): # Time it ! 
start_time = time.time() # In each epoch, we do a full pass over the training data: train_err = 0 train_acc = 0 train_count = 0 for doc, y in iterate_data(input_docs_train, Y_train, shuffle=False): words = doc.split(" ") if not any([embedding.has(word) for word in words]): continue # If no embeddings, skip this doc err, acc = train_fn(y, *embedding.get_variables(words)) train_err += err train_acc += acc train_count += 1 sys.stdout.write("\r" + "total train acc: \t{:.2f}".format(train_acc * 100 / train_count)) # And a full pass over the validation data data again: val_err = 0 val_acc = 0 val_count = 0 for doc, y in iterate_data(input_docs_val, Y_val, shuffle=False): words = doc.split(" ") if not any([embedding.has(word) for word in words]): continue # If no embeddings, skip this doc err, acc = val_fn(y, *embedding.get_variables(words)) val_err += err val_acc += acc val_count += 1 # Then we print the results for this epoch: sys.stdout.write("\r" + "Epoch {} of {} took {:.3f}s \n".format( epoch + 1, self.num_epochs, time.time() - start_time)) print(" training accuracy:\t\t{:.2f} %".format( train_acc / train_count * 100)) print(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_count * 100)) # Early stopping, if validation accuracy starts to decrease we stop if val_acc > best_validation_loss: # improve patience if loss improvement is good enough if val_acc > best_validation_loss * 1.0/improvement_threshold: patience = max(patience, epoch * patience_increase) # We have a new peak validation accuracy, evaluate on test set best_validation_loss = val_acc test_err = 0 test_acc = 0 test_count = 0 for doc, y in iterate_data(input_docs_test, Y_test, shuffle=False): words = doc.split(" ") if not any([embedding.has(word) for word in words]): continue # If no embeddings, skip this doc err, acc = val_fn(y, *embedding.get_variables(words)) test_err += err test_acc += acc test_count += 1 best_test_acc = test_acc / test_count print(" test accuracy:\t\t{:.2f} %".format( best_test_acc * 100)) if patience <= epoch: break return best_test_acc
def build_mlp(input_var):
    l_in = lasagne.layers.InputLayer(shape=(None, 2), input_var=input_var)
    l_hid1 = lasagne.layers.DenseLayer(l_in, num_units=4, nonlinearity=lasagne.nonlinearities.sigmoid)
    l_out = lasagne.layers.DenseLayer(l_hid1, num_units=2, nonlinearity=lasagne.nonlinearities.sigmoid)
    return l_out


input_var = T.fmatrix('inputs')
target_var = T.ivector('targets')

network = build_mlp(input_var)

prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

# Weight the L2 penalty on the network being trained and add it to the loss.
layers = {network: 0.002}
l2_penalty = regularize_layer_params_weighted(layers, l2)
loss = loss + l2_penalty

params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.07, momentum=0.9)

test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
test_loss = test_loss.mean() + l2_penalty
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
pred = T.eq(T.argmax(test_prediction, axis=1), target_var)

train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_acc])
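# --- Hedged usage sketch (not from the original source) ----------------------
# Tiny smoke test for the train_fn / val_fn pair compiled above, on a toy
# XOR-style dataset (2 features, 2 classes). Purely illustrative; the data and
# step count are placeholders.
import numpy as np

X_toy = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
y_toy = np.array([0, 1, 1, 0], dtype=np.int32)
for _ in range(200):
    train_fn(X_toy, y_toy)
acc, = val_fn(X_toy, y_toy)
print("toy accuracy: %.2f" % float(acc))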
def main(model=MODEL,gradient = GRADIENT, num_epochs=NUM_EPOCHS, num_hidden_units = NUM_HIDDEN_UNITS, bnalg = BNALG, lr_start = LR_START): # Set the Initial Learning Rate; the Final Learning Rate and the number of training epochs LR_start= lr_start LR_fin = 0.01 epochs=num_epochs # LR_decay = (LR_fin/LR_start)**(1./epochs) LR_decay = 1 print("Generating the ImageDataGenerator") #Define the Image Data Generator, which is used for real-time data augmentation while training datagen = ImageDataGenerator( featurewise_center=False, # set input mean to 0 over the dataset samplewise_center=False, # set each sample mean to 0 featurewise_std_normalization=False, # divide inputs by std of the dataset samplewise_std_normalization=False, # divide each input by its std zca_whitening=False, # apply ZCA whitening rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) height_shift_range=0.1, # randomly shift images vertically (fraction of total height) horizontal_flip=True, # randomly flip images vertical_flip=False) # randomly flip images #Define Theano tensor variables for input, the labels and the learning rate input=T.tensor4('input') target=T.ivector('target') LR = T.scalar('LR', dtype=theano.config.floatX) #Define the Network print("Generating the cnn network") net=cnn_network(input, bnalg) #Define Training Output Variables print("Compiling the functions") train_output=lasagne.layers.get_output(net['l_out'],input,deterministic=False) ## Get the class probabilities train_pred=train_output.argmax(-1) ## Get the predicted class label train_loss=T.mean(lasagne.objectives.categorical_crossentropy(train_output,target)) #Using Cross-Entropy Loss train_err=T.mean(T.neq(T.argmax(train_output,axis=1), target),dtype=theano.config.floatX) #Compute the mean training precdiction error # Define Validation Output Variables val_output=lasagne.layers.get_output(net['l_out'],input,deterministic=True) val_loss=T.mean(lasagne.objectives.categorical_crossentropy(val_output,target)) val_err = T.mean(T.neq(T.argmax(val_output,axis=1), target),dtype=theano.config.floatX) val_pred=val_output.argmax(-1) # Set L2 regularization coefficient layers={} for k in net.keys(): layers[net[k]]=0.0005 l2_penalty = regularize_layer_params_weighted(layers, l2) train_loss=train_loss+l2_penalty #Define the Gradient Update Rule print("Compiling the functions: extract params") params = lasagne.layers.get_all_params(net['l_out'], trainable=True) #Get list of all trainable network parameters # bnparams = lasagne.layers.get_all_params(net['l_out'], trainable=False) #Get list of all BN untrainable network parameters if gradient == "adagrad": updates = lasagne.updates.adagrad(loss_or_grads=train_loss, params=params, learning_rate=LR) ## Use Adagrad Gradient Descent Learning Algorithm elif gradient == "rmsprop": updates = lasagne.updates.rmsprop(loss_or_grads=train_loss, params=params, learning_rate=LR) elif gradient == "sgd": updates = lasagne.updates.sgd(loss_or_grads=train_loss, params=params, learning_rate=LR) else: print("Invalid gradient name") #Define Theano Functions for Training and Validation #Theano Function for Training print("Compiling the functions: define train function") f_train=theano.function([input,target,LR],[train_loss,train_err],updates=updates,allow_input_downcast=True) # Theano Function for Validation print("Compiling the functions: define val function") 
f_val=theano.function([input,target],[val_loss,val_err],allow_input_downcast=True) # f_get_params=theano.function([LR],[bnparams],allow_input_downcast=True) #Begin Training print("Beging Training") train_stats,val_stats,per_epoch_params=batch_train(datagen,f_train,f_val,net['l_out'],LR_start,LR_decay,epochs=epochs,\ data_dir="../../data/cifar-10-batches-py/",train_bool=True) #output data list_epoch = [i[0] for i in val_stats] list_val_loss = [i[1] for i in val_stats] list_val_err = [i[2] for i in val_stats] list_val_acc = [1-i for i in list_val_err] epoch_mu = [i[0] for i in per_epoch_params] epoch_lambda = [i[1] for i in per_epoch_params] # epoch_params_mu = [i[0] for i in epoch_params] # epoch_params_std= [i[1] for i in epoch_params] np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"epoch.txt",list_epoch) np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"loss_val.txt",list_val_loss) np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"acc_val.txt",list_val_acc) np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"err_val.txt",list_val_err) np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"params_mu.txt",epoch_mu) np.savetxt(OUTPUT_DATA_PATH+model+"_"+gradient+"_"+str(num_epochs)+"_"+bnalg+"_"+"params_std.txt",epoch_lambda) print ("Data saved...")
def build_cnn(input_var=None, batch_size = None, class_num=10): # Input layer, as usual: l_in = lasagne.layers.InputLayer(shape=(batch_size, 1, 40, 40), input_var=input_var) loc_network_list = [] for i in range(class_num): loc_l1 = MaxPool2DLayer(l_in, pool_size=(2, 2)) loc_l2 = Conv2DLayer( loc_l1, num_filters=20, filter_size=(5, 5), W=lasagne.init.HeUniform('relu'), name = "loc_l2_%d" %i) loc_l3 = MaxPool2DLayer(loc_l2, pool_size=(2, 2)) loc_l4 = Conv2DLayer(loc_l3, num_filters=20, filter_size=(5, 5), W=lasagne.init.HeUniform('relu'), name = "loc_l4_%d" %i) loc_l5 = lasagne.layers.DenseLayer( loc_l4, num_units=50, W=lasagne.init.HeUniform('relu'), name = "loc_l5_%d" %i) loc_out = lasagne.layers.DenseLayer( loc_l5, num_units=1, W=lasagne.init.Constant(0.0), nonlinearity=lasagne.nonlinearities.identity, name = "loc_out_%d" %i) # Transformer network l_trans1 = RotationTransformationLayer(l_in, loc_out) print "Transformer network output shape: ", l_trans1.output_shape loc_network_list.append(l_trans1) network_transformed = lasagne.layers.ConcatLayer(loc_network_list, axis = 1) network_transformed = lasagne.layers.ReshapeLayer(network_transformed, (-1, 1, 40, 40)) conv_1 = Conv2DLayer( network_transformed, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform()) # Max-pooling layer of factor 2 in both dimensions: network = MaxPool2DLayer(conv_1, pool_size=(2, 2)) # Another convolution with 32 5x5 kernels, and another 2x2 pooling: conv_2 = Conv2DLayer( network, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W = lasagne.init.GlorotUniform() #nonlinearity=lasagne.nonlinearities.sigmoid ) network = lasagne.layers.MaxPool2DLayer(conv_2, pool_size=(2, 2)) # A fully-connected layer of 256 units with 50% dropout on its inputs: fc1 = lasagne.layers.DenseLayer( lasagne.layers.dropout(network, p=.5), #network, num_units=256, #nonlinearity=lasagne.nonlinearities.sigmoid nonlinearity=lasagne.nonlinearities.rectify, ) # And, finally, the 10-unit output layer with 50% dropout on its inputs: fc2 = lasagne.layers.DenseLayer( lasagne.layers.dropout(fc1, p=.5), nonlinearity=lasagne.nonlinearities.identity, num_units=10, ) fc2_selected = SelectLayer(fc2, 10) # fc2_selected = lasagne.layers.NonlinearityLayer(fc2_selected, nonlinearity=lasagne.nonlinearities.softmax) #network_transformed = lasagne.layers.ReshapeLayer(network_transformed, (-1, 10, 10, 40, 40)) weight_decay_layers = {fc1:0.0, fc2:0.002} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return fc2_selected, l2_penalty, network_transformed, [conv_1, conv_2, fc1, fc2]
def sgd_optimization(NNInput): """ Demonstrate stochastic gradient descent optimization of a log-linear model :type LearningRate: float :param LearningRate: learning rate used (factor for the stochastic gradient) :type NEpoch: int :param NEpoch: maximal number of epochs to run the optimizer :type PathToData: string :param PathToData: the path of the dataset file """ def normalized_squared_error(a, b, expon): """Computes the element-wise squared normalized difference between two tensors. .. math:: L = ( (p - t) / t )^2 Parameters ---------- a, b : Theano tensor The tensors to compute the squared difference between. Returns ------- Theano tensor An expression for the item-wise squared difference. """ a, b = align_targets(a, b) return T.square((a - b) / T.abs_(b)**expon) # / T.abs_(TargetVar)**(0.0) / T.abs_(b)**expon def weighted_squared_error(a, b, Shift, Power): """Computes the element-wise squared normalized difference between two tensors. .. math:: L = ( (p - t) / t )^2 Parameters ---------- a, b : Theano tensor The tensors to compute the squared difference between. Returns ------- Theano tensor An expression for the item-wise squared difference. """ a, b = align_targets(a, b) Vi = T.maximum(b, Shift) w = T.power(Shift/b, Power) return w * T.square(a - b) # / T.abs_(TargetVar)**(0.0) / T.abs_(b)**expon def align_targets(predictions, targets): """Helper function turning a target 1D vector into a column if needed. This way, combining a network of a single output unit with a target vector works as expected by most users, not broadcasting outputs against targets. Parameters ---------- predictions : Theano tensor Expression for the predictions of a neural network. targets : Theano tensor Expression or variable for corresponding targets. Returns ------- predictions : Theano tensor The predictions unchanged. targets : Theano tensor If `predictions` is a column vector and `targets` is a 1D vector, returns `targets` turned into a column vector. Otherwise, returns `targets` unchanged. """ if (getattr(predictions, 'broadcastable', None) == (False, True) and getattr(targets, 'ndim', None) == 1): targets = as_theano_expression(targets).dimshuffle(0, 'x') return predictions, targets ################################################################################################################################## ### LOADING DATA ################################################################################################################################## print('\nLoading Data ... 
\n') if (NNInput.TryNNFlg > 0): datasets, datasetsTry, G_MEAN, G_SD, RDataOrig, yDataOrig, yDataDiatOrig = load_data(NNInput) else: datasets, G_MEAN, G_SD, RDataOrig, yDataOrig, yDataDiatOrig = load_data(NNInput) RSetTrain, GSetTrain, ySetTrain, ySetTrainDiat, ySetTrainTriat = datasets[0] RSetValid, GSetValid, ySetValid, ySetValidDiat, ySetValidTriat = datasets[1] #RSetTest, GSetTest, ySetTest, ySetTestDiat, ySetTestTriat = datasets[2] #plot_set(NNInput, RSetTrain.get_value(), ySetTrainDiat.get_value(), RSetValid.get_value(), ySetValidDiat.get_value(), RSetTest.get_value(), ySetTestDiat.get_value()) NNInput.NIn = RSetTrain.get_value(borrow=True).shape[1] NNInput.NOut = ySetTrain.get_value(borrow=True).shape[1] print((' Nb of Input: %i') % NNInput.NIn) print((' Nb of Output: %i \n') % NNInput.NOut) if (NNInput.Model=='ModPIP') or (NNInput.Model=='PIP'): NNInput.NLayers = NNInput.NHid NNInput.NLayers.insert(0,NNInput.NIn) NNInput.NLayers.append(NNInput.NOut) NTrain = RSetTrain.get_value(borrow=True).shape[0] NBatchTrain = NTrain // NNInput.NMiniBatch NValid = RSetValid.get_value(borrow=True).shape[0] #NTest = RSetTest.get_value(borrow=True).shape[0] print((' Nb of Training Examples: %i') % NTrain) print((' Nb of Training Batches: %i') % NBatchTrain) print((' Nb of Validation Examples: %i') % NValid) #print((' Nb of Test Examples: %i \n') % NTest) ###################### # BUILD ACTUAL MODEL # ###################### InputVar = T.dmatrix('Inputs') #InputVar.tag.test_value = numpy.random.randint(100,size=(100,3)) InputVar.tag.test_value = numpy.array([[1.0,2.0,7.0],[3.0,5.0,11.0]]) * 0.529177 TargetVar = T.dmatrix('Targets') #TargetVar.tag.test_value = numpy.random.randint(100,size=(100,1)) Layers = create_nn(NNInput, InputVar, TargetVar) TrainPrediction = lasagne.layers.get_output(Layers[-1]) if (NNInput.LossFunction == 'squared_error'): TrainError = T.sqr(TrainPrediction - TargetVar) TrainLoss = lasagne.objectives.squared_error(TrainPrediction, TargetVar) elif (NNInput.LossFunction == 'normalized_squared_error'): TrainError = T.abs_( (TrainPrediction - TargetVar) / T.abs_(TargetVar)**NNInput.OutputExpon) TrainLoss = normalized_squared_error(TrainPrediction, TargetVar, NNInput.OutputExpon) elif (NNInput.LossFunction == 'huber_loss'): TrainError = T.abs_( (TrainPrediction - TargetVar) ) TrainLoss = lasagne.objectives.huber_loss(TrainPrediction, TargetVar, delta=5) elif (NNInput.LossFunction == 'weighted_squared_error'): TrainError = T.abs_( (TrainPrediction - TargetVar) ) TrainLoss = weighted_squared_error(TrainPrediction, TargetVar, NNInput.Shift, NNInput.Power) if (NNInput.Model == 'ModPIP'): LayersK = {Layers[2]: 1.0, Layers[3]: 1.0} elif (NNInput.Model=='ModPIPPol'): LayersK = {Layers[1]: 1.0} elif (NNInput.Model=='PIP'): LayersK = {Layers[0]: 1.0, Layers[1]: 1} L2Penalty = regularize_layer_params_weighted(LayersK, l2) L1Penalty = regularize_layer_params_weighted(LayersK, l1) #TrainLoss = TrainLoss TrainLoss = TrainLoss.mean() + NNInput.kWeightDecay[0] * L1Penalty + NNInput.kWeightDecay[1] * L2Penalty params = lasagne.layers.get_all_params(Layers[-1], trainable=True) if (NNInput.Method == 'nesterov'): updates = lasagne.updates.nesterov_momentum(TrainLoss, params, learning_rate=NNInput.LearningRate, momentum=NNInput.kMomentum) elif (NNInput.Method == 'rmsprop'): updates = lasagne.updates.rmsprop(TrainLoss, params, learning_rate=NNInput.LearningRate, rho=NNInput.RMSProp[0], epsilon=1e-06) elif (NNInput.Method == 'adamax'): updates = lasagne.updates.adamax(TrainLoss, params, 
learning_rate=NNInput.LearningRate, beta1=0.9, beta2=0.999, epsilon=1e-08) elif (NNInput.Method == 'amsgrad'): updates = lasagne.updates.amsgrad(TrainLoss, params, learning_rate=NNInput.LearningRate, beta1=0.9, beta2=0.999, epsilon=1e-08) elif (NNInput.Method == 'adam'): updates = lasagne.updates.adam(TrainLoss, params, learning_rate=NNInput.LearningRate, beta1=0.9, beta2=0.999, epsilon=1e-08) elif (NNInput.Method == 'adadelta'): updates = lasagne.updates.adadelta(TrainLoss, params, learning_rate=NNInput.LearningRate, rho=0.95, epsilon=1e-08) TrainFn = theano.function(inputs=[InputVar, TargetVar], outputs=[TrainError, TrainLoss], updates=updates) ValidPrediction = lasagne.layers.get_output(Layers[-1], deterministic=True) if (NNInput.LossFunction == 'squared_error'): ValidError = T.sqr(ValidPrediction - TargetVar) elif (NNInput.LossFunction == 'normalized_squared_error'): ValidError = T.sqr((ValidPrediction - TargetVar) / TargetVar) ValidError = T.sqrt(ValidError.mean()) elif (NNInput.LossFunction == 'huber_loss'): ValidError = T.sqr(ValidPrediction - TargetVar) ValidError = T.sqrt(ValidError.mean()) elif (NNInput.LossFunction == 'weighted_squared_error'): Vi = T.maximum(ValidPrediction, NNInput.Shift) w = T.power(NNInput.Shift/TargetVar, NNInput.Power) ValidError = w * T.sqr(ValidPrediction - TargetVar) ValidError = T.sqrt(ValidError.mean()) ValFn = theano.function(inputs=[InputVar, TargetVar], outputs=ValidError) ############### # TRAIN MODEL # ############### print('\n\nTRAINING ... ') if (NNInput.fvalid < 0): fValid = NBatchTrain * numpy.absolute(NNInput.fvalid) else: fValid = NNInput.fvalid BestValidError = numpy.inf BestIter = 0 TestScore = 0. tStart = timeit.default_timer() iEpoch = 0 LoopingFlg = True iIterTot = 0 Train = [] TrainEpochVec = [] Valid = [] ValidEpochVec = [] iTry = 0 if (NNInput.Model=='ModPIP') or (NNInput.Model == 'ModPIPPol'): xSetTrain = RSetTrain xSetValid = RSetValid #xSetTest = RSetTest xDataOrig = RDataOrig elif (NNInput.Model == 'PIP'): xSetTrain = GSetTrain xSetValid = GSetValid #xSetTest = GSetTest #xDataOrig = GDataOrig # print(xSetTrain) # print(xSetValid) # print(xSetTest) # print(xDataOrig) # print(ySetTrain.get_value()) # print(ySetValid.get_value()) # print(ySetTest.get_value()) # print(yDataOrig) # time.sleep(5) ThisTrainError = 0.0 while (iEpoch < NNInput.NEpoch) and (LoopingFlg): iEpoch += 1 iMiniBatch = 0 TrainErrorVec = [] for TrainBatch in iterate_minibatches(xSetTrain, ySetTrain, NNInput.NMiniBatch, shuffle=True): iMiniBatch += 1 iIterTot = (iEpoch - 1) * NBatchTrain + iMiniBatch TrainInputs, TrainTargets = TrainBatch [TrainErrorTemp, MiniBatchAvgCost] = TrainFn(TrainInputs, TrainTargets) TrainErrorVec = numpy.append(TrainErrorVec, TrainErrorTemp) if (iIterTot + 1) % fValid == 0: ValidErorrVec = [] for ValidBatch in iterate_minibatches(xSetValid, ySetValid, NValid, shuffle=False): ValidInputs, ValidTargets = ValidBatch ValidErorrVec = numpy.append(ValidErorrVec, ValFn(ValidInputs, ValidTargets)) ThisValidError = numpy.sqrt( numpy.mean(ValidErorrVec) ) ValidEpochVec = numpy.append(ValidEpochVec, iEpoch) Valid = numpy.append(Valid, ThisValidError) # fig = plt.figure() # plt.plot(ValidErorrVec, color='lightblue', linewidth=3) # #ax.set_xlim(,) # plt.show() print( '\n iEpoch %i, minibatch %i/%i, training error %f, validation error %f' % (iEpoch, iMiniBatch + 1, NBatchTrain, ThisTrainError, ThisValidError) ) # if we got the best validation score until now if ThisValidError < BestValidError: #improve patience if loss improvement is good enough #if 
(ThisValidError < BestValidError * NNInput.ImpThold): # NNInput.NPatience = max(NNInput.NPatience, iIterTot * NNInput.NDeltaPatience) BestValidError = ThisValidError BestIter = iIterTot # # test it on the test set # TestErrorVec = [] # for TestBatch in iterate_minibatches(xSetTest, ySetTest, NTest, shuffle=False): # TestInputs, TestTargets = TestBatch # TestErrorVec = numpy.append(TestErrorVec, ValFn(TestInputs, TestTargets)) # TestScore = numpy.mean(TestErrorVec) # print((' iEpoch %i, minibatch %i/%i, test error of best model %f') % (iEpoch, iMiniBatch + 1, NBatchTrain, TestScore)) print(' iEpoch %i, minibatch %i/%i, Best so far') if (NNInput.WriteFinalFlg > 0): for iLayer in range(len(NNInput.NLayers)-1): PathToFldr = NNInput.PathToOutputFldr + Layers[iLayer].name + '/' if not os.path.exists(PathToFldr): os.makedirs(PathToFldr) PathToFile = PathToFldr + 'Weights.npz' numpy.savez(PathToFile, *lasagne.layers.get_all_param_values(Layers[iLayer])) if (NNInput.WriteFinalFlg > 1): if (NNInput.Model == 'ModPIP'): if (iLayer == 0) and (NNInput.BondOrderStr != 'DiatPotFun'): save_parameters_PIP(PathToFldr, Layers[iLayer].Lambda.get_value(), Layers[iLayer].re.get_value()) elif (iLayer > 1): if (NNInput.BiasesFlg): save_parameters(PathToFldr, Layers[iLayer].W.get_value(), Layers[iLayer].b.get_value()) else: save_parameters_NoBiases(PathToFldr, Layers[iLayer].W.get_value()) elif (NNInput.Model == 'ModPIPPol'): if (iLayer == 0) and (NNInput.BondOrderStr != 'DiatPotFun'): save_parameters_PIP(PathToFldr, Layers[iLayer].Lambda.get_value(), Layers[iLayer].re.get_value()) elif (iLayer==1): save_parameters_NoBiases(PathToFldr, Layers[iLayer].W.get_value()) elif (NNInput.Model == 'PIP'): if (NNInput.BiasesFlg): save_parameters(PathToFldr, Layers[iLayer].W.get_value(), Layers[iLayer].b.get_value()) else: save_parameters_NoBiases(PathToFldr, Layers[iLayer].W.get_value()) if (NNInput.TryNNFlg > 1): i=-1 for Ang in NNInput.AngVector: i=i+1 iTry=iTry+1 RSetTry, GSetTry, ySetTry, ySetTryDiat, ySetTryTriat = datasetsTry[i] if (NNInput.Model == 'ModPIP') or (NNInput.Model == 'ModPIPPol'): xSetTry = RSetTry elif (NNInput.Model == 'PIP'): xSetTry = GSetTry NTry = xSetTry.get_value(borrow=True).shape[0] NBatchTry = NTry // NNInput.NMiniBatch yPredTry = lasagne.layers.get_output(Layers[-1], inputs=xSetTry) if (NNInput.TryNNFlg > 2): PathToTryLabels = NNInput.PathToOutputFldr + '/REBestDet.csv.' + str(iTry) else: PathToTryLabels = NNInput.PathToOutputFldr + '/REBestDet.csv.' 
+ str(Ang) yPredTry = T.cast(yPredTry, 'float64') yPredTry = yPredTry.eval() yPredTry = InverseTransformation(NNInput, yPredTry, ySetTryDiat.get_value()) ySetTry = T.cast(ySetTry, 'float64') ySetTry = ySetTry.eval() ySetTry = InverseTransformation(NNInput, ySetTry, ySetTryDiat.get_value()) save_to_plot(PathToTryLabels, 'Evaluated', numpy.concatenate((RSetTry.get_value(), ySetTry, yPredTry), axis=1)) TrainEpochVec = numpy.append(TrainEpochVec, iEpoch) ThisTrainError = numpy.sqrt( numpy.mean(TrainErrorVec) ) Train = numpy.append(Train, ThisTrainError) ############################################################################################################# ### LOADING THE OPTIMAL PARAMETERS for iLayer in range(len(NNInput.NLayers)-1): PathToFldr = NNInput.PathToWeightFldr + Layers[iLayer].name + '/' print(' Loading Parameters for Layer ', iLayer, ' from File ', PathToFldr) if (NNInput.Model == 'ModPIP'): if (iLayer == 0) and (NNInput.BondOrderStr != 'DiatPotFun'): save_parameters_PIP(PathToFldr, Layers[iLayer].Lambda.get_value(), Layers[iLayer].re.get_value()) elif (iLayer > 1): if (NNInput.BiasesFlg): save_parameters(PathToFldr, Layers[iLayer].W.get_value(), Layers[iLayer].b.get_value()) else: save_parameters_NoBiases(PathToFldr, Layers[iLayer].W.get_value()) elif (NNInput.Model == 'ModPIPPol'): if (iLayer == 0) and (NNInput.BondOrderStr != 'DiatPotFun'): save_parameters_PIP(PathToFldr, Layers[iLayer].Lambda.get_value(), Layers[iLayer].re.get_value()) elif (iLayer==1): save_parameters_NoBiases(PathToFldr, Layers[iLayer].W.get_value()) elif (NNInput.Model == 'PIP'): save_parameters(PathToFldr, Layers[iLayer].W.get_value(), Layers[iLayer].b.get_value()) ############################################################################################################# ### Evaluating Model for a Particular Data-Set if (NNInput.TryNNFlg > 0): i=-1 for Ang in NNInput.AngVector: i=i+1 RSetTry, GSetTry, ySetTry, ySetTryDiat, ySetTryTriat = datasetsTry[i] if (NNInput.Model == 'ModPIP') or (NNInput.Model == 'ModPIPPol'): xSetTry = RSetTry elif (NNInput.Model == 'PIP'): xSetTry = GSetTry NTry = xSetTry.get_value(borrow=True).shape[0] NBatchTry = NTry // NNInput.NMiniBatch yPredTry = lasagne.layers.get_output(Layers[-1], inputs=xSetTry) PathToTryLabels = NNInput.PathToOutputFldr + '/REBestDet.csv.' 
+ str(Ang) yPredTry = T.cast(yPredTry, 'float64') yPredTry = yPredTry.eval() yPredTry = InverseTransformation(NNInput, yPredTry, ySetTryDiat.get_value()) ySetTry = T.cast(ySetTry, 'float64') ySetTry = ySetTry.eval() ySetTry = InverseTransformation(NNInput, ySetTry, ySetTryDiat.get_value()) save_to_plot(PathToTryLabels, 'Evaluated', numpy.concatenate((RSetTry.get_value(), ySetTry, yPredTry), axis=1)) ############################################################################################################# ### COMPUTING ERRORS ySetTrain = InverseTransformation(NNInput, ySetTrain.get_value(), ySetTrainDiat.get_value()) ySetValid = InverseTransformation(NNInput, ySetValid.get_value(), ySetValidDiat.get_value()) #ySetTest = InverseTransformation(NNInput, ySetTest.get_value(), ySetTestDiat.get_value()) yPredTrain = lasagne.layers.get_output(Layers[-1], inputs=xSetTrain) yPredTrain = T.cast(yPredTrain, 'float64') yPredTrain = yPredTrain.eval() yPredTrain = InverseTransformation(NNInput, yPredTrain, ySetTrainDiat.get_value()) error_Train = ySetTrain - yPredTrain plot_error(NNInput, error_Train, 'Train') yPredValid = lasagne.layers.get_output(Layers[-1], inputs=xSetValid) yPredValid = T.cast(yPredValid, 'float64') yPredValid = yPredValid.eval() yPredValid = InverseTransformation(NNInput, yPredValid, ySetValidDiat.get_value()) error_Valid = ySetValid - yPredValid plot_error(NNInput, error_Valid, 'Valid') # yPredTest = lasagne.layers.get_output(Layers[-1], inputs=xSetTest) # yPredTest = T.cast(yPredTest, 'float64') # yPredTest = yPredTest.eval() # yPredTest = InverseTransformation(NNInput, yPredTest, ySetTestDiat.get_value()) # error_Test = ySetTest - yPredTest # plot_error(NNInput, error_Test, 'Test') # plot_set(NNInput, RSetTrain.get_value(), ySetTrain, RSetValid.get_value(), ySetValid, RSetTest.get_value(), ySetTest) yPredOrig = lasagne.layers.get_output(Layers[-1], inputs=xDataOrig) yPredOrig = T.cast(yPredOrig, 'float64') yPredOrig = yPredOrig.eval() yPredOrig = InverseTransformation(NNInput, yPredOrig, yDataDiatOrig) plot_scatter(NNInput, yPredOrig, yDataOrig) #plot_overall_error(NNInput, yPredOrig, yDataOrig) plot_history(NNInput, TrainEpochVec, Train, ValidEpochVec, Valid) tEnd = timeit.default_timer() print(('\nOptimization complete. Best validation score of %f obtained at iteration %i, with test performance %f') % (BestValidError, BestIter + 1, TestScore)) print(('\nThe code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((tEnd - tStart) / 60.)), file=sys.stderr)
def build_cnn(input_var=None): """Build the CIFAR-10 model. Args: images: Images returned from distorted_inputs() or inputs(). Returns: Logits. """ # We instantiate all variables using tf.get_variable() instead of # tf.Variable() in order to share variables across multiple GPU training runs. # If we only ran this model on a single GPU, we could simplify this function # by replacing all instances of tf.get_variable() with tf.Variable(). # input_layer = InputLayer((None, 3, IMAGE_SIZE, IMAGE_SIZE), input_var=input_var) # conv1 conv1 = Conv2DLayer(input_layer, num_filters=64, filter_size=(5,5), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.0), name="conv1") # pool1 pool1 = MaxPool2DLayer(conv1, pool_size=(3, 3), stride=(2, 2), pad=1) # norm1 norm1 = LocalResponseNormalization2DLayer(pool1, alpha=0.001 / 9.0, beta=0.75, k=1.0, n=9) # conv2 conv2 = Conv2DLayer(norm1, num_filters=64, filter_size=(5,5), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name='conv2') # norm2 norm2 = LocalResponseNormalization2DLayer(conv2, alpha=0.001 / 9.0, beta=0.75, k=1.0, n=9) # pool2 pool2 = MaxPool2DLayer(norm2, pool_size=(3, 3), stride=(2, 2), pad=1) # fc1 fc1 = DenseLayer(pool2, num_units=384, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name="fc1") # fc2 fc2 = DenseLayer(fc1, num_units=192, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name="fc2") # fc3 softmax_layer = DenseLayer(fc2, num_units=10, nonlinearity=lasagne.nonlinearities.softmax, W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.0), name="softmax") # Weight Decay weight_decay_layers = {fc1: 0.002, fc2: 0.002} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return softmax_layer, l2_penalty
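# --- Hedged usage sketch (not from the original source) ----------------------
# The CIFAR-10 builder above returns the softmax layer together with a weighted
# L2 penalty on fc1/fc2 but leaves compilation to the caller. The helper below
# is a hypothetical validation-side wiring; input_var and target_var are
# assumed to be a tensor4 of images and an int vector of labels.
import theano
import theano.tensor as T
import lasagne


def compile_cifar_val_fn(input_var, target_var):
    softmax_layer, l2_penalty = build_cnn(input_var)
    test_prediction = lasagne.layers.get_output(softmax_layer, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var).mean()
    test_loss = test_loss + l2_penalty  # optionally report the regularized objective, as other snippets here do
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    return theano.function([input_var, target_var], [test_loss, test_acc],
                           allow_input_downcast=True)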
def build_network_MyModel( args, input1_var, input1_mask_var, input2_var, intut2_mask_var, wordEmbeddings, maxlen=36, reg=0.5 * 1e-4 ): # need use theano.scan print ("Building model LSTM + Featue Model + 2D Convolution +MLP") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] GRAD_CLIP = wordDim input_1 = InputLayer((None, maxlen), input_var=input1_var) batchsize, seqlen = input_1.input_var.shape input_1_mask = InputLayer((None, maxlen), input_var=input1_mask_var) emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_1.params[emb_1.W].remove("trainable") lstm_1 = LSTMLayer( emb_1, num_units=args.lstmDim, mask_input=input_1_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh ) input_2 = InputLayer((None, maxlen), input_var=input2_var) input_2_mask = InputLayer((None, maxlen), input_var=input2_mask_var) emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_2.params[emb_2.W].remove("trainable") lstm_2 = LSTMLayer( emb_2, num_units=args.lstmDim, mask_input=input_2_mask, grad_clipping=GRAD_CLIP, nonlinearity=tanh ) # print "LSTM shape", get_output_shape(lstm_2) # LSTM shape (None, 36, 150) cos_feats = CosineSimLayer([lstm_1, lstm_2]) print "SSSS", get_output_shape(cos_feats) # lstm_1 = SliceLayer(lstm_1, indices=slice(-6, None), axis=1) # lstm_2 = SliceLayer(lstm_2, indices=slice(-6, None), axis=1) # concat = ConcatLayer([lstm_1, lstm_2],axis=2) #(None, 36, 300) """ num_filters = 32 stride = 1 """ filter_size = (10, 10) pool_size = (4, 4) """ filter_size=(3, 10) pool_size=(2,2) reshape = ReshapeLayer(concat, (batchsize, 1, 6, 2*args.lstmDim)) conv2d = Conv2DLayer(reshape, num_filters=num_filters, filter_size=filter_size, nonlinearity=rectify,W=GlorotUniform()) maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size) #(None, 32, 6, 72) """ # Another convolution with 32 5x5 kernels, and another 2x2 pooling: # conv2d = Conv2DLayer(maxpool, num_filters=32, filter_size=(5, 5), nonlinearity=rectify) # maxpool = MaxPool2DLayer(conv2d, pool_size=(2, 2)) # A fully-connected layer of 256 units with 50% dropout on its inputs: # hid = DenseLayer(DropoutLayer(maxpool, p=.2),num_units=128,nonlinearity=rectify) hid = DenseLayer(cos_feats, num_units=10, nonlinearity=sigmoid) if args.task == "sts": network = DenseLayer(hid, num_units=5, nonlinearity=logsoftmax) elif args.task == "ent": network = DenseLayer(hid, num_units=3, nonlinearity=logsoftmax) layers = {lstm_1: reg, hid: reg, network: reg} penalty = regularize_layer_params_weighted(layers, l2) input_dict = { input_1: input1_var, input_2: input2_var, input_1_mask: input1_mask_var, input_2_mask: input2_mask_var, } return network, penalty, input_dict
def build_cnn(input_var=None, batch_size = None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(None, 1, 227, 227), input_var=input_var) repeatInput = Repeat(network, 61) network = lasagne.layers.ReshapeLayer(repeatInput, (-1, 1, 227, 227)) network_transformed = RotationTransformationLayer(network, batch_size * 61) network = Conv2DLayer( network_transformed, num_filters=96, filter_size=(11, 11), stride=(4,4), nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform()) network = LRN(network, alpha = 0.0001, beta = 0.75, n = 5) # Max-pooling layer of factor 2 in both dimensions: network = MaxPool2DLayer(network, pool_size=(3, 3), stride=(2,2)) # Another convolution with 32 5x5 kernels, and another 2x2 pooling: network = Conv2DLayer( network, num_filters=256, filter_size=(5, 5), pad = 2, nonlinearity=lasagne.nonlinearities.rectify, W = lasagne.init.GlorotUniform() #nonlinearity=lasagne.nonlinearities.sigmoid ) network = LRN(network, alpha = 0.0001, beta = 0.75, n = 5) network = lasagne.layers.MaxPool2DLayer(network, pool_size=(3, 3), stride = (2, 2)) # A fully-connected layer of 256 units with 50% dropout on its inputs: fc1 = lasagne.layers.DenseLayer( network, num_units=256, W = lasagne.init.Normal(0.01), nonlinearity=lasagne.nonlinearities.rectify, ) # And, finally, the 10-unit output layer with 50% dropout on its inputs: fc2 = lasagne.layers.DenseLayer( fc1, num_units=4096, W = lasagne.init.Normal(0.005), nonlinearity=lasagne.nonlinearities.rectify, ) fc3 = lasagne.layers.DenseLayer( lasagne.layers.dropout(fc2, p=.5), num_units=4096, W = lasagne.init.Normal(0.005), nonlinearity=lasagne.nonlinearities.rectify, ) fc4 = lasagne.layers.DenseLayer( lasagne.layers.dropout(fc3, p=.5), num_units=61, nonlinearity=lasagne.nonlinearities.identity, ) network_transformed = lasagne.layers.ReshapeLayer(network_transformed, (-1, 61, 40, 40)) fc4_selected = SelectLayer(fc4, 61) weight_decay_layers = {fc1:0.0, fc2:0.0} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return fc4, fc4_selected, l2_penalty, network_transformed
def __init_model(self): """Initializes the model and compiles the network For the most part, this consists of setting up some bookkeeping for theano and lasagne, and compiling the theano functions """ logging.info('initializing model') if self.Xshape == None or self.yshape == None: if self.Xshape == None: logging.warning("Tried to compile Neural Net before" "setting input dimensionality") if self.yshape == None: logging.warning("Tried to compile Neural Net before" "setting output dimensionality") raise ShapeError(self.Xshape,self.yshape) # These are theano/lasagne symbolic variable declarationss, # representing... the target vector(traces) target_vector = T.fmatrix('y') # our predictions predictions = lasagne.layers.get_output(self.layer_out) validation_predictions = lasagne.layers.get_output(self.layer_out, deterministic=True) # the loss (diff in objective) for training # using MSE stochastic_loss = lasagne.objectives.squared_error(predictions, target_vector).mean() #print(stochastic_loss) deterministic_loss = lasagne.objectives.squared_error(validation_predictions, target_vector).mean() # using cross entropy #stochastic_loss = lasagne.objectives.categorical_crossentropy(predictions, target_vector).mean() # the loss for validation #deterministic_loss = lasagne.objectives.categorical_crossentropy(test_predictions, target_vector).mean() # calculate loss loss = stochastic_loss # should regularization be used? config = self.config if config: if config.l1_regularization: logging.info("Using L1 regularization") l1_penalty = regularize_layer_params(self.layer_out, l1) * 1e-4 loss += l1_penalty if config.l2_regularization: logging.info("Using L2 regularization with weights") for sublayer in self.layer_in: logging.info("\tinput layer ({1}) weight: {0}".format(self.layer_weights[sublayer],sublayer.name)) logging.info("\toutput layer weight: {0}".format(self.layer_weights[self.layer_out])) l2_penalty = regularize_layer_params_weighted(self.layer_weights, l2) loss += l2_penalty else: logging.info("No regularization") # the network parameters (i.e. weights) all_params = lasagne.layers.get_all_params( self.layer_out) # how to update the weights updates = lasagne.updates.nesterov_momentum( loss_or_grads = loss, params = all_params, learning_rate = 0.1, momentum = 0.9) # The theano functions for training, validating, and tracing. # These get method-level wrappers below logging.info('compiling theano functions') self._train_fn = theano.function( on_unused_input='warn', inputs = [l.input_var for l in self.layer_in]+[target_vector], outputs = [stochastic_loss], updates = updates) self._valid_fn = theano.function( on_unused_input='warn', inputs = [l.input_var for l in self.layer_in]+[target_vector], outputs = [deterministic_loss, validation_predictions]) self._trace_fn = theano.function( on_unused_input='warn', inputs = [l.input_var for l in self.layer_in], outputs = [validation_predictions * self.roi.shape[0] + self.roi.offset[0]])
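# --- Hedged usage sketch (not from the original source) ----------------------
# __init_model above expects self.layer_weights to map Lasagne layers to L2
# coefficients before it is handed to regularize_layer_params_weighted. One
# plausible way to build such a mapping, assuming layer_in is the list of
# input layers and layer_out the output layer; the coefficient values are
# placeholders, not values from the original code.
def build_layer_weights(layer_in, layer_out, input_coeff=1e-4, output_coeff=1e-3):
    """Return a {layer: coefficient} dict for regularize_layer_params_weighted."""
    weights = {sublayer: input_coeff for sublayer in layer_in}
    weights[layer_out] = output_coeff
    return weights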
def build_cnn(input_var=None): """Build the CIFAR-10 model. Args: images: Images returned from distorted_inputs() or inputs(). Returns: Logits. """ # We instantiate all variables using tf.get_variable() instead of # tf.Variable() in order to share variables across multiple GPU training runs. # If we only ran this model on a single GPU, we could simplify this function # by replacing all instances of tf.get_variable() with tf.Variable(). # input_layer = InputLayer((None, 3, IMAGE_SIZE, IMAGE_SIZE), input_var=input_var) norm0 = BatchNormLayer(input_layer) # conv1 conv1 = Conv2DLayer(norm0, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.0), name="conv1") conv1a = Conv2DLayer(conv1, num_filters=64, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.0), name="conv1a") pool1 = MaxPool2DLayer(conv1a, pool_size=(2, 2), stride=(2, 2), pad=0) norm1 = BatchNormLayer(pool1) # pool1 # conv2 conv2 = Conv2DLayer(lasagne.layers.dropout(norm1, p = 0.5), num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name='conv2') conv2a = Conv2DLayer(conv2, num_filters=128, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name='conv2a') pool2 = MaxPool2DLayer(conv2a, pool_size=(2, 2), stride=(2, 2), pad=0) # norm2 norm2 = BatchNormLayer(pool2) # pool2 conv3 = Conv2DLayer(lasagne.layers.dropout(norm2, p = 0.5), num_filters=256, filter_size=(3,3), nonlinearity=lasagne.nonlinearities.rectify, pad='same', W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name='conv3') pool3 = MaxPool2DLayer(conv3, pool_size=(2, 2), stride=(2, 2), pad=0) norm3 = BatchNormLayer(pool3) # fc1 fc1 = DenseLayer(lasagne.layers.dropout(norm3, p = 0.5), num_units=256, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.1), name="fc1") # fc3 softmax_layer = DenseLayer(lasagne.layers.dropout(fc1, p = 0.5), num_units=10, nonlinearity=lasagne.nonlinearities.softmax, W=lasagne.init.HeNormal(), b=lasagne.init.Constant(0.0), name="softmax") # Weight Decay weight_decay_layers = {fc1: 0.0} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) network_middle_output = lasagne.layers.ReshapeLayer(softmax_layer, shape = (([0], 10))) return softmax_layer, network_middle_output, l2_penalty
def build_cnn(input_var=None, support_var = None, batch_size = None): # Input layer, as usual: network = lasagne.layers.InputLayer(shape=(batch_size, 1, 40, 40), input_var=input_var) support_input = lasagne.layers.InputLayer(shape=(batch_size, 10, 40, 40), input_var=support_var) repeatInput = Repeat(network, 10) network = lasagne.layers.ReshapeLayer(repeatInput, (-1, 1, 40, 40)) network_transformed_TPS = TPSTransformationMatrixLayer(network, batch_size * 10) network_transformed_TPS_reshape = lasagne.layers.ReshapeLayer(network_transformed_TPS, (-1, 10, 40, 40)) after_support_layer = lasagne.layers.ElemwiseMergeLayer([network_transformed_TPS_reshape, support_input], T.mul) after_support_layer = lasagne.layers.ReshapeLayer(after_support_layer, (-1 , 1, 40, 40)) network = Conv2DLayer( after_support_layer, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform()) # Max-pooling layer of factor 2 in both dimensions: network = MaxPool2DLayer(network, pool_size=(2, 2)) # Another convolution with 32 5x5 kernels, and another 2x2 pooling: network = Conv2DLayer( network, num_filters=32, filter_size=(5, 5), nonlinearity=lasagne.nonlinearities.rectify, W = lasagne.init.GlorotUniform() ) network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2)) # A fully-connected layer of 256 units with 50% dropout on its inputs: fc1 = lasagne.layers.DenseLayer( lasagne.layers.dropout(network, p=.5), #network, num_units=256, nonlinearity=lasagne.nonlinearities.rectify, ) # And, finally, the 10-unit output layer with 50% dropout on its inputs: fc2 = lasagne.layers.DenseLayer( lasagne.layers.dropout(fc1, p=.5), nonlinearity=lasagne.nonlinearities.identity, num_units=10, ) network_transformed = lasagne.layers.ReshapeLayer(after_support_layer, (-1, 10, 40, 40)) fc2_selected = SelectLayer(fc2, 10) weight_decay_layers = {network_transformed_TPS:0.1} l2_penalty = regularize_layer_params_weighted(weight_decay_layers, l2) return fc2, fc2_selected, l2_penalty, network_transformed, network