def _build(self):
    """Wire up the three-layer stack: sparse input -> dense hidden -> dense output.

    Side effects: appends each layer to self.layers and stores the activations
    in self.hidden1, self.embeddings and self.outputs.
    """
    sparse_in = SparseLayer(input_dim=self.input_dim,
                            output_dim=FLAGS.hidden1,
                            features_nonzero=self.features_nonzero,
                            act=tf.nn.relu,
                            dropout=self.dropout,
                            logging=self.logging)
    self.layers.append(sparse_in)
    self.hidden1 = sparse_in(self.inputs)

    hidden_dense = DenseLayer(input_dim=FLAGS.hidden1,
                              output_dim=FLAGS.hidden2,
                              act=lambda x: x,
                              dropout=self.dropout,
                              logging=self.logging)
    self.layers.append(hidden_dense)
    self.embeddings = hidden_dense(self.hidden1)

    out_dense = DenseLayer(input_dim=FLAGS.hidden2,
                           output_dim=self.output_dim,
                           act=lambda x: x,
                           dropout=self.dropout,
                           logging=self.logging)
    self.layers.append(out_dense)
    self.outputs = out_dense(self.embeddings)
def define_network(self, image):
    """VGG-style feature extractor: four conv blocks, each ended by max-pooling,
    followed by two dropout-regularised dense layers.

    Returns the output of the last dropout layer.
    """
    # (block index, output channels, number of conv layers) for each block.
    block_specs = [(1, 64, 2), (2, 128, 2), (3, 256, 4), (4, 512, 4)]
    x = image
    channels = 3  # RGB input
    for block_idx, out_channels, n_convs in block_specs:
        with tf.name_scope("Block%d" % block_idx):
            for conv_idx in range(1, n_convs + 1):
                x = ConvLayer(x, channels, out_channels,
                              name="conv%d_%d" % (block_idx, conv_idx))
                channels = out_channels
            x = MaxPoolLayer(x, name='pool%d' % block_idx)
    with tf.name_scope("DenseBlock"):
        fc6 = DenseLayer(x, 1024, name='fc6')
        drop6 = DropoutLayer(fc6, dropout_rate=self.p)
        fc7 = DenseLayer(drop6, 1024, name='fc7')
        drop7 = DropoutLayer(fc7, dropout_rate=self.p)
    return drop7
def _build_decoder(self):
    """Return the ordered list of decoder layers (latent -> reconstruction)."""
    hidden = DenseLayer(num_units=128)
    activation = LeakyReLU()
    output = DenseLayer(num_units=self.input_dim)
    return [hidden, activation, output]
def _build_encoder(self):
    """Return the ordered list of encoder layers (input -> latent factors)."""
    hidden = DenseLayer(num_units=128, input_shape=self.input_dim)
    activation = LeakyReLU()
    latent = DenseLayer(num_units=self.latent_factors)
    return [hidden, activation, latent]
def __init__(self):
    """Two-layer sigmoid MLP: 784 -> 20 -> 1, small random weights."""
    super().__init__()
    input_units = 28 * 28  # flattened 28x28 image
    hidden_units = 20
    # Fully connected layer followed by a sigmoid, twice.
    self.l1 = DenseLayer(input_units, hidden_units, w_std=0.01)
    self.sig1 = SigmoidLayer()
    self.l2 = DenseLayer(hidden_units, 1, w_std=0.01)
    self.sig2 = SigmoidLayer()
def adddiscriminator(self, num_1, num_2):
    """Append a two-layer dense classification head on top of the feature layer.

    The head layers are registered in trainable_layers only (the original
    deliberately left them out of all_layers, per its commented-out code);
    the softmax output is stored in self.category_layer.
    """
    hidden = DenseLayer(self.feature_layer, name="cate_1", num_units=num_1)
    self.trainable_layers += (hidden,)
    logits = DenseLayer(hidden, name="cate_2", num_units=num_2)
    self.trainable_layers += (logits,)
    self.category_layer = Softmax(logits)
def load_pathnet(filename):
    """Deserialize a PathNet (layers, training state, tasks) from a pickled log.

    NOTE: uses pickle — only load files from trusted sources.
    """
    with open(filename, 'rb') as f:
        log = pickle.load(f)

    # Rebuild layer objects, then initialize the network and restore weights.
    layers = []
    for layer_log in log['layer_logs']:
        kind = layer_log['layer_type']
        if kind == 'dense':
            layers.append(DenseLayer.build_from_log(layer_log))
        elif kind == 'conv':
            layers.append(ConvLayer.build_from_log(layer_log))
    Layer.initialize_whole_network(layers, log['in_shape'])
    for layer, layer_log in zip(layers, log['layer_logs']):
        layer.load_layer_log(layer_log)

    # Reassemble the PathNet around the restored layers.
    pathnet = PathNet(input_shape=log['in_shape'],
                      width=log['width'],
                      depth=log['depth'])
    pathnet._layers = layers
    pathnet.training_counter = log['training_counter']
    pathnet.max_modules_pr_layer = log['max_modules_pr_layer']
    pathnet.min_modules_pr_layer = log['min_modules_pr_layer']

    # Restore each task head and its weights.
    tasks = []
    for task_log in log['task_logs']:
        task = TaskContainer.build_from_log(task_log)
        pathnet.path2model(pathnet.random_path(), task)
        task.layer.set_weights(task_log['layer_weights'])
        tasks.append(task)
    pathnet._tasks = tasks
    return pathnet
def _build(self):
    """Create one DenseLayer per consecutive pair in self.num_hidden.

    The final layer uses the identity activation; all earlier layers use
    self.act. Layers are appended to self.layers in order.
    """
    last_idx = len(self.num_hidden) - 1
    for idx in range(1, len(self.num_hidden)):
        activation = (lambda x: x) if idx == last_idx else self.act
        self.layers.append(
            DenseLayer(self.num_hidden[idx - 1],
                       self.num_hidden[idx],
                       activation,
                       self.weight_initializer,
                       self.bias_initializer,
                       stddev=self.stddev))
def get_layers_dict(json_list):
    """Convert a list of layer-config dicts into an OrderedDict of torch layers.

    Each entry's 'layer_type' selects the wrapper class; the wrapper's
    get_torch_layer() returns (name, layer) tuples for the layer itself and,
    optionally, its activation.

    Raises:
        ValueError: for an unsupported 'layer_type'. (Previously an unsupported
        type either raised an opaque NameError or silently reused the layer
        from the prior iteration.)
    """
    layers_list = []
    for layer_info in json_list:
        layer_type = layer_info['layer_type']
        if 'dense' in layer_type:
            layer = DenseLayer(layer_info)
        elif 'conv2d' in layer_type:
            layer = Conv2dLayer(layer_info)
        elif 'flatten' in layer_type:
            layer = FlattenLayer(layer_info)
        else:
            raise ValueError("Unsupported layer_type: {!r}".format(layer_type))
        layer_tup, activation_tup = layer.get_torch_layer()
        layers_list.append(layer_tup)
        # Only append the activation when one is actually defined.
        if activation_tup[1] is not None:
            layers_list.append(activation_tup)
    # Preserve insertion order of the layer names.
    ret = collections.OrderedDict(layers_list)
    return ret
def __init__(self, config, weight_init):
    """Graph-convolutional encoder: relational GC layer followed by a dense layer."""
    super(GCEncoder, self).__init__()
    # Bookkeeping copied from the config.
    self.num_relations = config.num_relations
    self.num_users = config.num_users
    self.accum = config.accum
    # Sub-modules.
    self.rgc_layer = RGCLayer(config, weight_init)
    self.dense_layer = DenseLayer(config, weight_init)
def build_model(self):
    """Build a LeNet-style CNN: conv(20)->pool->conv(50)->pool->flatten->500->10->softmax.

    Fix: the first pooling layer passed the misspelled keyword `paddind`
    instead of `padding` (compare with 'pool2'), so its padding mode was
    never applied as intended.
    Side effect: stores the ordered layer list in self.layers.
    """
    layers = []
    input_shape = np.array(
        [self.batch_size, self.x_dim, self.x_dim, self.c_dim])
    # layer_1: input_layer ==> [n, 28, 28, 1]
    x = InputLayer(input_shape)
    layers.append(x)
    # layer_2: conv_layer [n, 28, 28, 1] ==> [n, 28, 28, 32]
    x = ConvLayer(x, output_nums=20, kernel=5, strides=1, padding='SAME', name='conv1')
    layers.append(x)
    # layer_4: avgpool_layer [n, 28, 28, 32] ==> [n, 14, 14, 32]
    # (fixed: `paddind` -> `padding`)
    x = MaxPoolLayer(x, kernel=2, strides=2, padding='SAME', name='pool1')
    layers.append(x)
    # layer_5: conv_layer [n, 14, 14, 32] ==> [n, 14, 14, 64]
    x = ConvLayer(x, output_nums=50, kernel=5, strides=1, padding='SAME', name='conv2')
    layers.append(x)
    # layer_7: avgpool_layer [n, 14, 14, 64] ==> [n, 7, 7, 64]
    x = MaxPoolLayer(x, kernel=2, strides=2, padding='SAME', name='pool2')
    layers.append(x)
    # layer_8: flatten_layer [n, 7, 7, 64] ==> [n, 7*7*64]
    x = FlattenLayer(x, name='flatten')
    layers.append(x)
    # layer_9: fullconnected_layer [n, 3136] ==> [n, 500]
    x = DenseLayer(x, output_nums=500, name='dense1')
    layers.append(x)
    # layer_10: relu_layer [n, 500] ==> [n, 500]
    x = ReLULayer(x, name='relu1')
    layers.append(x)
    # layer_11: fullconnected_layer [n, 500] ==> [n, 10]
    x = DenseLayer(x, output_nums=10, name='dense2')
    layers.append(x)
    # layer_12: softmax_layer [n, 10] ==> [n, 10]
    x = SoftMaxLayer(x, name='softmax')
    layers.append(x)
    self.layers = layers
def get_network(self):
    """Build a Network from the parsed layer configuration.

    The first constructed layer becomes the input layer; all subsequent layers
    are chained after it.

    Fixes:
      - `== None` comparisons replaced with `is None` (identity check).
      - An unknown "type" previously left `layer` bound to the *previous*
        iteration's layer, silently appending it twice; now raises ValueError.

    Returns:
        Network: the assembled network.
    """
    self._read_config()
    input_layer = None
    layers = []
    prev_layer = None
    for data in self._layers:
        if data["type"] == "input":
            input_size = self._input_size * self._input_size
            output_size = int(data["output_size"])
            layer = InputLayer(input_size, output_size)
        elif data["type"] == "dense":
            # Per-layer output size falls back to the network-level default.
            if "output_size" in data:
                output_size = int(data["output_size"])
            else:
                output_size = self._output_size
            activation_function_str = data["af"]
            activation_function = self._lookup_activation_function(
                activation_function_str)
            activation_function_d = self._lookup_activation_function_d(
                activation_function_str)
            learning_rate = float(data["la"])
            layer = DenseLayer(prev_layer.get_output_shape(), output_size,
                               activation_function, activation_function_d,
                               learning_rate)
        elif data["type"] == "convolution":
            # The first conv layer reads the raw (square, 1-channel) input.
            if prev_layer is None:
                input_shape = (self._input_size, self._input_size, 1)
            else:
                input_shape = prev_layer.get_output_shape()
            kernel_n = int(data["kernel_n"])
            kernel_m = int(data["kernel_m"])
            channels_out = int(data["channels"])
            output_shape = (kernel_n, kernel_m, channels_out)
            v_stride = int(data["stride_n"])
            h_stride = int(data["stride_m"])
            padding = int(data["padding"])
            la = float(data["la"])
            layer = ConvolutionLayer(input_shape, output_shape, h_stride,
                                     v_stride, padding, la)
        else:
            raise ValueError("Unknown layer type: {!r}".format(data["type"]))
        if input_layer is None:
            input_layer = layer
        else:
            layers.append(layer)
        prev_layer = layer
    network = Network(input_layer, layers)
    return network
def binary_mnist():
    """Assemble a small PathNet plus a single binary-MNIST task container.

    Returns:
        (pathnet, task) — the network with initialized, saved weights and the
        task bound to it.
    """
    config = [{'out': 20, 'activation': 'relu'}]
    input_shape = [28, 28, 1]
    output_size = 2
    depth = 3
    width = 10
    max_modules_pr_layer = 3
    learning_rate = 0.0001
    optimizer_type = SGD
    loss = 'binary_crossentropy'

    # The first layer flattens the image input; the rest are plain dense layers.
    layers = []
    for level in range(depth):
        layer_name = 'L' + str(level)
        if level == 0:
            layers.append(DenseLayer(width, layer_name, config, flatten=True))
        else:
            layers.append(DenseLayer(width, layer_name, config))
    Layer.initialize_whole_network(layers, input_shape)

    task = TaskContainer(input_shape, output_size,
                         name='unique_binary_mnist',
                         optimizer=optimizer_type, loss=loss,
                         lr=learning_rate)

    pathnet = PathNet(input_shape=input_shape, width=width, depth=depth)
    pathnet._layers = layers
    pathnet._tasks = [task]
    pathnet.max_modules_pr_layer = max_modules_pr_layer
    # Remember the initial weights so modules can later be reset.
    for layer in pathnet._layers:
        layer.save_initialized_weights()
    return pathnet, task
def __init__(self, config, weight_init):
    """Encoder combining linear user/item input encoders, a GNN layer and a dense layer."""
    super(OurGCEncoder, self).__init__()
    hidden_dim = 64  # width of the initial user/item encodings
    # Config bookkeeping.
    self.num_relations = config.num_relations
    self.num_users = config.num_users
    self.num_items = config.num_items
    self.accum = config.accum
    self.drop_prob = config.drop_prob
    # Raw 3-dimensional user/item features are projected to hidden_dim.
    self.encoder_user = nn.Linear(3, hidden_dim)
    self.encoder_item = nn.Linear(3, hidden_dim)
    # self.rgc_layer = RGCLayer(config, weight_init)
    self.gnn = GNNLayer(config, weight_init, config.model, config.use_uv,
                        self.num_users, self.num_relations)
    self.dense_layer = DenseLayer(config, weight_init)
    # Cache for edge objects (populated outside __init__).
    self.edge_obj_cache = {}
def _build(self):
    """Instantiate the DenseLayer stack described by self.num_hidden.

    The last layer is linear (identity activation); earlier layers use
    self.act. Each layer is appended to self.layers.
    """
    last_idx = len(self.num_hidden) - 1
    for idx in range(1, len(self.num_hidden)):
        activation = (lambda x: x) if idx == last_idx else self.act
        layer = DenseLayer(input_dim=self.num_hidden[idx - 1],
                           output_dim=self.num_hidden[idx],
                           act=activation,
                           weight_initializer=self.weight_initializer,
                           bias_initializer=self.bias_initializer,
                           stddev=self.stddev)
        self.layers.append(layer)
def _build(self):
    """Encoder graph (sparse input -> dense embedding) plus an inner-product
    decoder that reconstructs the adjacency matrix from the embeddings."""
    encoder_hidden = SparseLayer(input_dim=self.input_dim,
                                 output_dim=FLAGS.hidden1,
                                 features_nonzero=self.features_nonzero,
                                 act=tf.nn.relu,
                                 dropout=self.dropout,
                                 logging=self.logging)
    self.hidden1 = encoder_hidden(self.inputs)

    embedding_layer = DenseLayer(input_dim=FLAGS.hidden1,
                                 output_dim=FLAGS.hidden2,
                                 act=lambda x: x,
                                 dropout=self.dropout,
                                 logging=self.logging)
    self.embeddings = embedding_layer(self.hidden1)
    self.z_mean = self.embeddings

    decoder = InnerProductDecoder(input_dim=FLAGS.hidden2,
                                  act=lambda x: x,
                                  logging=self.logging)
    self.reconstruction_adjacency = decoder(self.embeddings)
    # Flatten the reconstructed adjacency into a vector of edge scores.
    self.reconstructions = tf.reshape(self.reconstruction_adjacency, [-1])
def __init__(self, layers):
    """Build the network from a list of layer-spec dicts.

    Each spec's "type" key selects the layer class; the remaining keys are
    passed through as constructor keyword arguments ("relu" takes none).
    Raises NotImplementedError for an unknown type.
    """
    self._network = []
    # Layer classes that take the remaining spec keys as kwargs.
    factories = {
        "data": DataLayer,   # this is a data layer
        "conv": ConvLayer,
        "pool": PoolLayer,
        "dense": DenseLayer,
        "loss": LossLayer,
    }
    for spec in layers:
        layer_type = spec.pop("type")
        if layer_type == "relu":
            new_layer = ReLULayer()
        elif layer_type in factories:
            new_layer = factories[layer_type](**spec)
        else:
            raise NotImplementedError(
                "Layer type: {0} not found".format(layer_type))
        self._network.append(new_layer)
    self.initialize()
def addDenseLayer(self, use_batch_norm=False, **kwargs):
    """Add dense layer.

    The new layer is stacked on the most recent layer (or the input layer if
    none exists yet). If use_batch_norm is True, a batch-normalization layer
    is appended right after it (not registered as trainable).
    """
    previous = self.all_layers[-1] if self.all_layers else self.input_layer
    self.n_dense_layers += 1
    dense = DenseLayer(previous,
                       name="dense%i" % self.n_dense_layers,
                       **kwargs)
    self.all_layers += (dense, )
    self.trainable_layers += (dense, )
    if use_batch_norm:
        self.n_bn_layers += 1
        bn = BatchNorm(dense, name="bn%i" % self.n_bn_layers)
        self.all_layers += (bn, )
def __init__(self, n_points_sample, L,
             max_num_neighs=4,
             descripDim=None,
             fittingDim=None,
             av=None,
             std=None,
             name='deepMDsimpleEnergy',
             **kwargs):
    """DeepMD-style classification model.

    Args:
        n_points_sample: number of sampled points per configuration.
        L: box/system size parameter.
        max_num_neighs: maximum number of neighbors considered.
        descripDim: layer widths of the descriptor pyramid
            (default [2, 4, 8, 16, 32]).
        fittingDim: layer widths of the fitting pyramid
            (default [16, 8, 4, 2, 1]).
        av, std: per-channel mean/std used to normalize the inputs
            (defaults [0.0, 0.0] and [1.0, 1.0]).

    Fix: the list defaults were mutable default arguments shared across
    instances; replaced with None sentinels — behaviour is unchanged for
    callers relying on the defaults.
    """
    super(DeepMDClassification, self).__init__(name=name, **kwargs)
    if descripDim is None:
        descripDim = [2, 4, 8, 16, 32]
    if fittingDim is None:
        fittingDim = [16, 8, 4, 2, 1]
    if av is None:
        av = [0.0, 0.0]
    if std is None:
        std = [1.0, 1.0]

    self.L = L
    # this should be done on the fly, for now we will keep it here
    self.n_points_sample = n_points_sample
    # maximum number of neighbors
    self.max_num_neighs = max_num_neighs
    # we normalize the inputs (should help for the training)
    self.av = av
    self.std = std
    self.descripDim = descripDim
    self.fittingDim = fittingDim
    self.descriptorDim = descripDim[-1]
    # we may need to use the tanh here
    self.layerPyramid = PyramidLayer(descripDim,
                                     actfn=tf.nn.relu,
                                     initializer=tf.initializers.GlorotUniform())
    self.layerPyramidInv = PyramidLayer(descripDim,
                                        actfn=tf.nn.relu,
                                        initializer=tf.initializers.GlorotUniform())
    # we may need to use the relu especially here
    self.fittingNetwork = PyramidLayer(fittingDim, actfn=tf.nn.relu)
    self.linfitNet = DenseLayer(2)
# tag::test_setup[] import load_mnist import network from layers import DenseLayer, ActivationLayer training_data, test_data = load_mnist.load_data() # <1> net = network.SequentialNetwork() # <2> net.add(DenseLayer(784, 392)) # <3> net.add(ActivationLayer(392)) net.add(DenseLayer(392, 196)) net.add(ActivationLayer(196)) net.add(DenseLayer(196, 10)) net.add(ActivationLayer(10)) # <4> # <1> First, load training and test data. # <2> Next, initialize a sequential neural network. # <3> You can then add dense and activation layers one by one. # <4> The final layer has size 10, the number of classes to predict. # end::test_setup[] # tag::test_run[] net.train(training_data, epochs=10, mini_batch_size=10, learning_rate=3.0, test_data=test_data) # <1> # <1> You can now easily train the model by specifying train and test data, the number of epochs, the mini-batch size and the learning rate. # end::test_run[]
# NOTE(review): `prepocessed_images` and `class_label` are defined earlier in
# the file (not visible in this excerpt). The misspelling "prepocessed" is kept
# as-is because it is an existing variable/string name.
print("prepocessed_images.shape:", prepocessed_images.shape)
# Move channels first: (N, H, W, C) -> (N, C, H, W).
prepocessed_images = np.transpose(prepocessed_images, (0, 3, 1, 2))
print("prepocessed_images.shape after transpose:", prepocessed_images.shape)

# Train-test split 90%-10%
X_train, X_test, y_train, y_test = train_test_split(prepocessed_images, class_label, test_size=0.1)

# Two conv/detector/pool stages, then flatten and a 100-10-1 dense head
# ending in a sigmoid for binary classification.
cnn = MyCNN(
    ConvLayer(filter_size=3, num_filter=3, num_channel=3),
    DetectorLayer(),
    PoolLayer(filter_size=3, stride_size=4, mode="Max"),
    ConvLayer(filter_size=3, num_filter=3, num_channel=3),
    DetectorLayer(),
    PoolLayer(filter_size=3, stride_size=1, mode="Max"),
    FlattenLayer(),
    DenseLayer(n_units=100, activation='relu'),
    DenseLayer(n_units=10, activation='relu'),
    DenseLayer(n_units=1, activation='sigmoid'),
)

cnn.fit(
    features=X_train,
    target=y_train,
    batch_size=5,
    epochs=5,
    learning_rate=0.1
)

# Persist the trained model under this name.
model_name = 'pretrained_model'
cnn.save_model(model_name)
def get_dense_layer():
    """Build a 2-in/1-out DenseLayer with fixed weights, bias and cached input
    (a deterministic fixture for tests)."""
    dense = DenseLayer(2, 1)
    dense.w = np.asarray([[1.], [2.]])
    dense.b = np.asarray([2.])
    dense._input_data = np.asarray([[-1, 2]])
    return dense
def build_tower(config, seqs, lengths, labels, initializers=None):
    """Build one training/evaluation tower for the aspect classifier.

    Pipeline: (optionally ELMo-augmented) embeddings -> bidirectional cudnn
    RNN -> one attentive pooler per aspect -> highway -> 4-way dense logits.
    Training uses label smoothing plus an F1-based loss term; evaluation
    rebuilds the graph with averaged variables via a custom getter.

    Returns:
        (train_loss, eval_one_hot_preds, elmo_saver_or_None)

    Fix: `initializers` was a mutable dict default argument; replaced with a
    None sentinel (behaviour unchanged for callers using the default).
    """
    if initializers is None:
        initializers = {}
    embedding = DropEmbeddingLayer(config.vocab_size, config.embed_size,
                                   output_keep_prob=config.keep_prob,
                                   kernel_initializer=initializers.get("embedding_init"),
                                   trainable=False)
    if config.use_elmo:
        elmo = ELMoLayer(vocab_size=config.vocab_size, embed_size=300,
                         hidden_size=1024, cell_type='lstm', num_layers=2,
                         l2_weight=0.1)
    rnn = CudnnRNNLayer(num_units=config.hidden_size,
                        num_layers=config.num_layers,
                        direction="bidirectional",
                        kernel_keep_prob=config.rnn_kernel_keep_prob,
                        output_keep_prob=config.keep_prob,
                        cell='lstm', name='rnn')
    # One attentive pooling head per aspect.
    poolers = [MultiHeadAttentivePooling(atn_units=config.atn_units,
                                         num_heads=1,
                                         atn_kernel_keep_prob=config.keep_prob,
                                         atn_weight_keep_prob=config.keep_prob,
                                         name="pooler_%d" % i)
               for i in range(config.num_aspects)]
    highway = HighwayLayer(num_units=config.hidden_size * 2, num_layers=2,
                           output_keep_prob=config.keep_prob, name='highway')
    dense = DenseLayer(num_units=4, name="dense")

    def aspect_logits(seqs, lengths, training=False):
        # Shared forward pass, reused for both the train and eval graphs.
        embed_seqs = embedding(seqs, training=training)
        if config.use_elmo:
            elmo_seqs = elmo(seqs, lengths)
            embed_seqs = tf.concat([elmo_seqs, embed_seqs], axis=-1)
        rnn_feat_seqs, _ = rnn(embed_seqs, lengths, training=training)
        feat_list = []
        for pooler in poolers:
            feat = pooler(rnn_feat_seqs, lengths, training=training)
            feat = tf.nn.dropout(feat, keep_prob=config.keep_prob)
            feat_list.append(feat)
        feats = tf.concat(feat_list, axis=1)
        feats = highway(feats, training=training)
        logits = dense(feats)
        return logits

    smooth_labels = get_smooth_label(tf.one_hot(labels, depth=4, dtype=tf.float32))
    train_logits = aspect_logits(seqs, lengths, training=True)
    train_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=train_logits,
                                                labels=smooth_labels))
    train_loss += get_f1_loss(smooth_labels, tf.nn.softmax(train_logits))
    if config.use_elmo:
        train_loss += elmo.reg()

    # Rebuild the eval graph on the same variables, read through an averaging
    # custom getter (weight averaging for evaluation).
    vs = tf.get_variable_scope()
    avger, avg_getter = avg_getter_factory()
    vs.set_custom_getter(avg_getter)
    vs.reuse_variables()
    embedding.build()
    rnn.set_avger(avger)
    for pooler in poolers:
        pooler.build([None, None, config.hidden_size * 2])
    dense.build([None, config.hidden_size * 2])
    eval_logits = aspect_logits(seqs, lengths, training=False)
    eval_oh_preds = tf.one_hot(tf.argmax(eval_logits, axis=-1), depth=4,
                               on_value=True, off_value=False, dtype=tf.bool)
    if config.use_elmo:
        return train_loss, eval_oh_preds, elmo.saver
    else:
        return train_loss, eval_oh_preds, None
# Load MNIST, flatten each image to a (1, 784) row and scale pixels to [0, 1].
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], 1, 28 * 28)
x_train = x_train.astype('float32')
x_train /= 255
y_train = np_utils.to_categorical(y_train)  # one-hot labels

x_test = x_test.reshape(x_test.shape[0], 1, 28 * 28)
x_test = x_test.astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)

# Network: 784-100-50-10 MLP with tanh activations, trained with MSE.
model = NeuralNetwork()
model.add(DenseLayer(28 * 28, 100))
model.add(ActivationLayer(tanh, dtanh))
model.add(DenseLayer(100, 50))
model.add(ActivationLayer(tanh, dtanh))
model.add(DenseLayer(50, 10))
model.add(ActivationLayer(tanh, dtanh))

model.use(mse, dmse)
# Train on the first 1000 samples only to keep the demo fast.
model.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
out = model.predict(x_test[0:3])
print("\n")
print("predicted values : ")
print(out, end="\n")
print("true values : ")
import numpy as np

from network import NeuralNetwork
from layers import DenseLayer, ActivationLayer
from activations import tanh, dtanh
from losses import mse, dmse

# XOR truth table; inputs shaped (n_samples, 1, 2), targets (n_samples, 1, 1).
x_train = np.array([[[0, 0]], [[0, 1]], [[1, 0]], [[1, 1]]])
y_train = np.array([[[0]], [[1]], [[1]], [[0]]])

# A 2-3-1 tanh network trained with mean squared error.
net = NeuralNetwork()
net.add(DenseLayer(2, 3))
net.add(ActivationLayer(tanh, dtanh))
net.add(DenseLayer(3, 1))
net.add(ActivationLayer(tanh, dtanh))

net.use(mse, dmse)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

print(net.predict(x_train))
# Grid search over (epochs, learning rate, L2 weight, momentum) with k-fold
# cross-validation; writes one CSV row per configuration.
# NOTE(review): `filepath`, `epochs`, `learning_rates`, `regularizations`,
# `momentums`, `k`, `size` and `data` are defined earlier in the file
# (not visible in this excerpt).
fp = open(filepath, "w")
config = 0
for epoch in epochs:
    for lr in learning_rates:
        for reg in regularizations:
            for alpha in momentums:
                mean_loss = 0
                mean_validation = 0
                for i in range(k):
                    # Fresh 10-50-30-2 network per fold.
                    model = NeuralNetwork()
                    model.add(InputLayer(10))
                    model.add(DenseLayer(50, fanin=10))
                    model.add(DenseLayer(30, fanin=50))
                    model.add(OutputLayer(2, fanin=30))
                    # Learning rate is scaled by the batch size.
                    model.compile(size, epoch, lr / size, None, reg, alpha,
                                  "mean_squared_error")
                    (train, val) = data.kfolds(index=i, k=k)
                    # fit() returns the loss history; keep the final value.
                    mean_loss = mean_loss + model.fit(train[0], train[1])[-1]
                    mean_validation = mean_validation + model.evaluate(
                        val[0], val[1])
                fp.write("{}, {}, {}, {}, {}, {}, {}\n".format(
                    config, epoch, lr, reg, alpha, mean_loss / k,
                    mean_validation / k))
                config = config + 1
# NOTE(review): the first line below is the tail of a reshape(...) call whose
# beginning lies above this excerpt (flattening test images to (N, H*W)).
(test_images.shape[0], test_images.shape[1] * test_images.shape[2]))
# Scale pixel values from [0, 255] to [0, 1].
training_images *= 1.0 / 255.0
test_images *= 1.0 / 255.0

np.random.seed(1345134)  # fixed seed for reproducibility

# training_images = training_images[0:1]
# training_labels = training_labels[0:1]
# print (training_images.shape)
# print (training_labels.shape)
# print (test_images.shape)
# print (test_labels.shape)

# Hidden ReLU DenseLayers sized by `sizes`, ending in softmax cross-entropy.
# NOTE(review): `sizes`, `max_iter`, `learning_rate`, `batch_size` come from
# earlier in the file (not visible here).
layers = [DenseLayer(training_images.shape[1], sizes[0], ReLUActivation())]
i = 1
while i < len(sizes):
    layers.append(DenseLayer(sizes[i - 1], sizes[i], ReLUActivation()))
    i += 1
layers.append(SoftmaxCrossEntropyLayer(sizes[i - 1], training_labels.shape[1]))

classifier = Classifier(layers, softmax_cross_entropy)
classifier.train(training_images, training_labels, max_iter=max_iter,
                 learning_rate=learning_rate, target_acc=0.999,
                 batch_size=batch_size)
predictions = classifier.predict(training_images)
def __init__(self, x, y, args):
    """Build a DenseNet-style network (Theano/Lasagne) and its train/eval losses.

    Args:
        x, y: symbolic input and target variables.
        args: hyperparameters (dataset, depth, num_blocks, growth_rate, ...).

    Fix: `self.prediction_det` previously took argmax of the *stochastic*
    output `self.y`; it now uses the deterministic output `self.y_det`,
    matching `self.loss_det`.
    """
    self.params_theta = []
    self.params_lambda = []
    self.params_weight = []
    if args.dataset == 'mnist':
        input_size = (None, 1, 28, 28)
    elif args.dataset == 'cifar10':
        input_size = (None, 3, 32, 32)
    else:
        raise AssertionError
    if (args.depth - 1) % args.num_blocks != 0:
        raise ValueError("depth must be num_blocks * n + 1 for some n")

    # input and initial convolution
    layers = [InputLayer(input_size)]
    self.penalty = theano.shared(np.array(0.))
    layers.append(
        Conv2DLayer(args, layers[-1], args.first_output, 3, pad='same',
                    W=lasagne.init.HeNormal(gain='relu'), b=None,
                    nonlinearity=None, name='pre_conv'))
    self.add_params_to_self(args, layers[-1])
    layers.append(
        BatchNormLayer(layers[-1], name='pre_bn', beta=None, gamma=None))
    #self.add_params_to_self(args, layers[-1])
    # note: The authors' implementation does *not* have a dropout after the
    # initial convolution. This was missing in the paper, but important.
    # if dropout:
    #     layers.append(DropoutLayer(network, dropout))

    # dense blocks with transitions in between
    n = (args.depth - 1) // args.num_blocks
    for b in range(args.num_blocks):
        self.dense_block(args, layers, n - 1, args.growth_rate, args.dropout,
                         name_prefix='block%d' % (b + 1))
        if b < args.num_blocks - 1:
            self.transition(args, layers, args.dropout,
                            name_prefix='block%d_trs' % (b + 1))

    # post processing until prediction
    #TODO: treat initialization as hyperparameter, but don't regularize weights
    layers.append(ScaleLayer(args, layers[-1], name='post_scale'))
    self.add_params_to_self(args, layers[-1])
    layers.append(BiasLayer(args, layers[-1], name='post_shift'))
    self.add_params_to_self(args, layers[-1])
    layers.append(
        NonlinearityLayer(layers[-1], nonlinearity=rectify, name='post_relu'))
    layers.append(GlobalPoolLayer(layers[-1], name='post_pool'))
    #TODO: regularize
    layers.append(
        DenseLayer(args, layers[-1], args.classes, nonlinearity=softmax,
                   W=lasagne.init.HeNormal(gain=1), name='output'))
    self.add_params_to_self(args, layers[-1])
    self.layers = layers
    print(self.params_theta)
    print(self.params_weight)
    print(self.params_lambda)

    #training time: deterministic=False
    self.y = ll.get_output(layers[-1], x, deterministic=False)
    self.prediction = T.argmax(self.y, axis=1)
    # cost function
    self.loss = T.mean(categorical_crossentropy(self.y, y))
    self.lossWithPenalty = T.add(self.loss, self.penalty)

    #validation time: deterministic=True
    self.y_det = ll.get_output(layers[-1], x, deterministic=True)
    # fixed: use the deterministic output (was self.y)
    self.prediction_det = T.argmax(self.y_det, axis=1)
    # cost function
    self.loss_det = T.mean(categorical_crossentropy(self.y_det, y))
    self.lossWithPenalty_det = T.add(self.loss_det, self.penalty)
    print("loss and losswithpenalty", type(self.loss), type(self.lossWithPenalty))
def init_comp_graph(self, args, vecs, pretrained, mappings, invmappings, trans_length, feat_dim, log):
    """Build the full TF computation graph for the tagger+parser.

    Structure: word embeddings -> POS BiLSTM -> POS tagging head(s) ->
    Parse BiLSTM (fed with tagger outputs + POS embeddings) -> head/dependent
    dense projections -> transition-system scorer. Builds the training loss
    when self.train, otherwise prediction ops and placeholders for decoding.

    NOTE(review): this block was reconstructed from a whitespace-flattened
    source; the nesting of a few sections (marked below) should be verified
    against the original layout.
    """
    keep_prob = args.keep_prob if self.train else 1.0
    feat_shape = [5] if args.transsys == 'Cov' else ([feat_dim, 5] if self.train else [self.sent_length, 5])

    # build computational graph
    log.info('Building computational graph, this might take a while...')

    # POS BiLSTM
    log.debug('Building computational graph for the POS-BiLSTM...')
    word_emb_dim = vecs.shape[1]
    # Uppercased words are initialized with lowercased vectors, but finetuned separately
    pretrained_base = tf.Variable(vecs[:pretrained], trainable=False)
    pretrained_delta = tf.Variable(np.zeros(vecs[:pretrained].shape, dtype=floatX))
    pretrained_emb = tf.add(pretrained_base, pretrained_delta)
    random_emb = tf.Variable(vecs[pretrained:])
    embeddings = tf.concat([pretrained_emb, random_emb], 0)
    self.words = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
    self.words2 = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
    self.sent_lengths = tf.placeholder(tf.int32, [args.batch_size])
    word_emb = tf.nn.embedding_lookup(embeddings, self.words)
    word_emb2 = tf.nn.embedding_lookup(embeddings, self.words2)
    with tf.variable_scope('bilstm1'):
        lstm_fw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers)], state_is_tuple=True)
        lstm_bw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers)], state_is_tuple=True)
        bilstm_outputs, _ = rnn.bidirectional_dynamic_rnn(lstm_fw, lstm_bw, word_emb, sequence_length=self.sent_lengths, dtype=tf.float32)

    # POS
    self.gold_pos = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
    if args.fpos:
        self.gold_fpos = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])

    # POS system
    log.debug('Building computational graph for the POS-tagging system...')
    if not args.no_pos:
        log.debug('Building computational graph for dense layers following Parse-BiLSTM...')
        # Dense sizes: BiLSTM hidden -> configured sizes -> POS embedding dim -> tagset size.
        pos_densesizes = [args.hidden_size] + [int(x) for x in args.pos_dense_layers.split(',')] + [args.pos_emb_dim, len(mappings['pos'])]
        pos_denselayers = len(pos_densesizes) - 1
        pos_dense_inputs = [tf.reshape(x, [-1, args.hidden_size]) for x in bilstm_outputs]
        # First layer merges the forward/backward BiLSTM states.
        pos_dense = [MergeLayer(pos_densesizes[0], pos_densesizes[0], pos_densesizes[1], keepProb=keep_prob, combination='affine')]
        pos_dense += [DenseLayer(pos_densesizes[i], pos_densesizes[i+1], keepProb=keep_prob) for i in xrange(1, pos_denselayers - 1)]
        # split representations for head and dependent
        pos_dense += [DenseLayer(pos_densesizes[-2], pos_densesizes[-1], keepProb=keep_prob, nl=lambda x:x)]
        pos_dense_intermediate = pos_dense[0](pos_dense_inputs[0], pos_dense_inputs[1])
        for l in xrange(1, pos_denselayers-1):
            pos_dense_intermediate = pos_dense[l](pos_dense_intermediate)
        pos_dense_outputs = tf.reshape(pos_dense[-1](pos_dense_intermediate), [args.batch_size, -1, len(mappings['pos'])])
        if args.fpos:
            # Additional head for fine-grained POS tags.
            fpos_dense = DenseLayer(args.pos_emb_dim, len(mappings['fpos']), keepProb=keep_prob, nl=lambda x:x)
            fpos_dense_outputs = tf.reshape(fpos_dense(pos_dense_intermediate), [args.batch_size, -1, len(mappings['fpos'])])
    else:
        pos_dense_outputs = [None for _ in range(args.batch_size)]

    # POS embedding table with a final untrainable zero row (padding).
    # NOTE(review): nesting of this section relative to the `no_pos` branch
    # above was ambiguous in the flattened source — verify.
    pos_trainables = tf.Variable(tf.truncated_normal((len(mappings['pos']), args.pos_emb_dim)), dtype=tf.float32, name='pos_trainables')
    pos_untrainable = tf.Variable(tf.zeros((1, args.pos_emb_dim), dtype=tf.float32), trainable=False)
    pos_embeddings = tf.concat([pos_trainables, pos_untrainable], 0)
    pos_loss_pred_ = lambda i: self.pos_loss_pred(i, pos_embeddings, pos_dense_outputs[i], len(mappings['pos']), self.gold_pos, pos_trainables)
    if self.train:
        pos_losses = tf.multiply(args.pos_mult, tf.map_fn(lambda i: pos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32))
    else:
        self.pos_preds = tf.map_fn(lambda i: pos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size)
        self.pos_embs = tf.map_fn(lambda i: pos_loss_pred_(i)[1], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32)
    if args.fpos:
        # Same construction for the fine-grained POS tagset.
        fpos_trainables = tf.Variable(tf.truncated_normal((len(mappings['fpos']), args.pos_emb_dim)), dtype=tf.float32, name='fpos_trainables')
        fpos_untrainable = tf.Variable(tf.zeros((1, args.pos_emb_dim), dtype=tf.float32), trainable=False)
        fpos_embeddings = tf.concat([fpos_trainables, fpos_untrainable], 0)
        fpos_loss_pred_ = lambda i: self.pos_loss_pred(i, fpos_embeddings, fpos_dense_outputs[i], len(mappings['fpos']), self.gold_fpos, fpos_trainables)
        if self.train:
            fpos_losses = tf.multiply(args.pos_mult, tf.map_fn(lambda i: fpos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32))
            pos_losses = pos_losses + fpos_losses
        else:
            self.fpos_preds = tf.map_fn(lambda i: fpos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size)
            self.fpos_embs = tf.map_fn(lambda i: fpos_loss_pred_(i)[1], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32)

    bilstm_outputs = tf.concat([bilstm_outputs[0], bilstm_outputs[1]], 2)
    # Concatenate tagger BiLSTM outputs as Parser BiLSTM input
    concat_list = [bilstm_outputs]
    dim = args.hidden_size * 2
    concat_list += [self.pos_embs]
    dim += args.pos_emb_dim
    if args.fpos:
        concat_list += [self.fpos_embs]
        dim += args.pos_emb_dim
    bilstm2_inputs = tf.reshape(tf.concat(concat_list, 2), [args.batch_size, -1, dim])

    # Parse BiLSTM
    log.debug('Building computational graph for the Parse-BiLSTM...')
    with tf.variable_scope('bilstm2'):
        lstm2_fw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers2)], state_is_tuple=True)
        lstm2_bw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers2)], state_is_tuple=True)
        bilstm2_outputs, _ = rnn.bidirectional_dynamic_rnn(lstm2_fw, lstm2_bw, bilstm2_inputs, sequence_length=self.sent_lengths, dtype=tf.float32)

    # Dense layer(s)
    log.debug('Building computational graph for dense layers following Parse-BiLSTM...')
    densesizes = [args.hidden_size] + [int(x) for x in args.dense_layers.split(',')] + [args.rel_emb_dim]
    denselayers = len(densesizes) - 1
    dense_inputs = [tf.reshape(x, [-1, args.hidden_size]) for x in bilstm2_outputs]
    if denselayers == 1:
        # Single layer: two parallel merge layers, one for head and one for dependent.
        dense = [[MergeLayer(densesizes[0], densesizes[0], densesizes[1], keepProb=keep_prob, combination='affine') for _ in xrange(2)]]
        dense_outputs = [dense[0][j](dense_inputs[0], dense_inputs[1]) for j in xrange(2)]
    else:
        dense = [MergeLayer(densesizes[0], densesizes[0], densesizes[1], keepProb=keep_prob, combination='affine')]
        dense += [DenseLayer(densesizes[i], densesizes[i+1], keepProb=keep_prob) for i in xrange(1, denselayers - 1)]
        # split representations for head and dependent
        dense += [[DenseLayer(densesizes[-2], densesizes[-1], keepProb=keep_prob) for _ in xrange(2)]]
        dense_outputs = dense[0](dense_inputs[0], dense_inputs[1])
        for l in xrange(1, denselayers-1):
            dense_outputs = dense[l](dense_outputs)
        dense_outputs = [dense[-1][j](dense_outputs) for j in xrange(2)]
    dense_outputs = [tf.reshape(x, [args.batch_size, -1, args.rel_emb_dim]) for x in dense_outputs]
    self.combined_head = dense_outputs[0]
    self.combined_dep = dense_outputs[1]

    # transition system
    log.debug('Building computational graph for the transition system...')
    if self.train:
        self.trans_feat_ids = tf.placeholder(tf.int32, [args.batch_size, trans_length] + feat_shape)
        self.trans_feat_sizes = tf.placeholder(tf.int32, [args.batch_size, trans_length])
        self.trans_labels = tf.placeholder(tf.int32, [args.batch_size, trans_length])
        self.trans_lengths = tf.placeholder(tf.int32, [args.batch_size])
    else:
        self.trans_feat_ids = tf.placeholder(tf.int32, [None] + feat_shape)
        self.trans_feat_sizes = tf.placeholder(tf.int32, [None])
    self.rel_merge = MergeLayer(args.rel_emb_dim, args.rel_emb_dim, args.rel_emb_dim, keepProb=keep_prob, combination=args.combination)
    if args.transsys == 'NCov' or args.transsys == 'Cov2' or args.transsys == 'Cov3':
        self.rel_dense = DenseLayer(args.rel_emb_dim, len(mappings['rel']), nl=lambda x:x)
        transition_dense = MergeLayer(args.rel_emb_dim, args.rel_emb_dim, 1, nl=lambda x:x, combination=args.combination)
        self.transition_logit = transition_dense(tf.reshape(self.combined_head, [-1, args.rel_emb_dim]), tf.reshape(self.combined_dep, [-1, args.rel_emb_dim]))
        self.transition_logit = tf.reshape(self.transition_logit, (args.batch_size, -1))
    elif args.transsys in ['AER', 'AES', 'Cov']:
        self.rel_dense = DenseLayer(args.rel_emb_dim * 4, 2 + 2 * len(mappings['rel']), nl=lambda x:x)
    elif args.transsys in ['ASd', 'AH']:
        self.rel_dense = DenseLayer(args.rel_emb_dim * 4, 1 + 2 * len(mappings['rel']), nl=lambda x:x)

    SHIFT = mappings['action']['Shift']
    if self.train:
        # Per-(sentence, transition) loss function selected by transition system.
        if args.transsys == 'NCov' or args.transsys == 'Cov3' :
            trans_loss_f = lambda i, j: self.NCov_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i], self.transition_logit[i], SHIFT)
        elif args.transsys == 'Cov2':
            trans_loss_f = lambda i, j: self.NCov_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i], self.transition_logit[i], SHIFT)
        else:
            trans_loss_f = lambda i, j: self.traditional_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i])
        def _ex_loss(i):
            # Total transition loss for example i, plus its POS loss if tagging is enabled.
            trans_loss = tf.reduce_sum(tf.map_fn(lambda j: trans_loss_f(i, j), tf.range(self.trans_lengths[i]), dtype=tf.float32, parallel_iterations=100))
            if not args.no_pos:
                loss = tf.add(pos_losses[i], trans_loss)
            else:
                loss = trans_loss
            return loss
        losses = tf.map_fn(_ex_loss, tf.range(args.batch_size), dtype=tf.float32, parallel_iterations=100)
        self._loss = tf.reduce_mean(losses)
    else:
        # Decoding-time placeholders: head/dep representations are fed back in.
        self.combined_head_placeholder = tf.placeholder(tf.float32, (None, self.sent_length, args.rel_emb_dim))
        self.combined_dep_placeholder = tf.placeholder(tf.float32, (None, self.sent_length, args.rel_emb_dim))
        if args.transsys == 'NCov' or args.transsys == 'Cov3':
            self.trans_logit_placeholder = tf.placeholder(tf.float32, (None, self.sent_length))
            trans_pred = lambda i, k: self.NCov_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i], self.trans_logit_placeholder[i], SHIFT)
            self.pred_output_size = self.sent_length * len(mappings['rel']) + 1
        elif args.transsys == 'Cov2':
            self.trans_logit_placeholder = tf.placeholder(tf.float32, (None, self.sent_length))
            trans_pred = lambda i, k: self.NCov_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i], self.trans_logit_placeholder[i], SHIFT)
            self.pred_output_size = self.sent_length * len(mappings['rel']) + 1 # There are 2 (NA and SH), but we set it back to 1 because NA becomes an arc-transition
        else:
            trans_pred = lambda i, k: self.traditional_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i])
            if args.transsys in ['AES', 'AER', 'Cov']:
                self.pred_output_size = 2 * len(mappings['rel']) + 2
            elif args.transsys in ['ASd', 'AH']:
                self.pred_output_size = 2 * len(mappings['rel']) + 1
        # One predictor per (example, beam slot).
        self._trans_predictors = [[trans_pred(i,k) for k in range(args.beam_size)] for i in xrange(args.batch_size)]
def main(num_epochs=10, k=100, batch_size=128, display_freq=100,
         save_freq=1000, load_previous=False, attention=True,
         word_by_word=True, p=0, mode='word_by_word'):
    """Build, optionally restore, train and evaluate the SNLI entailment model.

    Parameters
    ----------
    num_epochs : int
        Number of full passes over the training data.
    k : int
        Dimensionality used by the linear projection, encoder and decoder.
    batch_size : int
        Mini-batch size for the training, validation and test passes.
    display_freq : int
        Print running loss/accuracy every `display_freq` training batches.
    save_freq : int
        Checkpoint the model every `save_freq` training batches.
    load_previous : bool
        When True, restore weights from the `mode` checkpoint file before
        training. (Previously this flag was printed but never acted on.)
    attention, word_by_word : bool
        Decoder configuration flags, forwarded to ``Decoder``.
    p : float
        Dropout rate applied to the embedding layers and the decoder output.
    mode : str
        Tag used to derive the checkpoint file names under ./snli/.
    """
    print('num_epochs: {}'.format(num_epochs))
    print('k: {}'.format(k))
    print('batch_size: {}'.format(batch_size))
    print('display_frequency: {}'.format(display_freq))
    print('save_frequency: {}'.format(save_freq))
    print('load previous: {}'.format(load_previous))
    print('attention: {}'.format(attention))
    print('word_by_word: {}'.format(word_by_word))
    save_filename = './snli/{}_model.npz'.format(mode)
    print("Building network ...")
    # Symbolic inputs: token-id matrices and their padding masks.
    premise_var = T.imatrix('premise_var')
    premise_mask = T.imatrix('premise_mask')
    hypo_var = T.imatrix('hypo_var')
    hypo_mask = T.imatrix('hypo_mask')
    # Load pre-trained embedding matrices; `with` closes the files promptly
    # (the original `pickle.load(open(...))` leaked the file handles).
    with open('./snli/unchanged_W.pkl', 'rb') as f:
        unchanged_W = pickle.load(f)
    unchanged_W = unchanged_W.astype('float32')
    unchanged_W_shape = unchanged_W.shape
    with open('./snli/oov_in_train_W.pkl', 'rb') as f:
        oov_in_train_W = pickle.load(f)
    oov_in_train_W = oov_in_train_W.astype('float32')
    oov_in_train_W_shape = oov_in_train_W.shape
    print('unchanged_W.shape: {0}'.format(unchanged_W_shape))
    print('oov_in_train_W.shape: {0}'.format(oov_in_train_W_shape))
    # hyperparameters
    learning_rate = 0.001
    l2_weight = 0.
    # Input layers
    l_premise = lasagne.layers.InputLayer(
        shape=(None, premise_max), input_var=premise_var)
    l_premise_mask = lasagne.layers.InputLayer(
        shape=(None, premise_max), input_var=premise_mask)
    l_hypo = lasagne.layers.InputLayer(
        shape=(None, hypothesis_max), input_var=hypo_var)
    l_hypo_mask = lasagne.layers.InputLayer(
        shape=(None, hypothesis_max), input_var=hypo_mask)
    # Embedded layers
    premise_embedding = EmbeddedLayer(
        l_premise, unchanged_W, unchanged_W_shape, oov_in_train_W,
        oov_in_train_W_shape, p=p)
    # weights shared with premise_embedding
    hypo_embedding = EmbeddedLayer(
        l_hypo,
        unchanged_W=premise_embedding.unchanged_W,
        unchanged_W_shape=unchanged_W_shape,
        oov_in_train_W=premise_embedding.oov_in_train_W,
        oov_in_train_W_shape=oov_in_train_W_shape,
        p=p,
        dropout_mask=premise_embedding.dropout_mask)
    # Dense layers (the hypothesis projection shares W/b with the premise's)
    l_premise_linear = DenseLayer(
        premise_embedding, k, nonlinearity=lasagne.nonlinearities.linear)
    l_hypo_linear = DenseLayer(
        hypo_embedding, k, W=l_premise_linear.W, b=l_premise_linear.b,
        nonlinearity=lasagne.nonlinearities.linear)
    encoder = Encoder(l_premise_linear, k, peepholes=False,
                      mask_input=l_premise_mask)
    # initialized with encoder final hidden state
    decoder = Decoder(l_hypo_linear, k, cell_init=encoder, peepholes=False,
                      mask_input=l_hypo_mask,
                      encoder_mask_input=l_premise_mask,
                      attention=attention, word_by_word=word_by_word)
    if p > 0.:
        print('apply dropout rate {} to decoder'.format(p))
        decoder = lasagne.layers.DropoutLayer(decoder, p)
    l_softmax = lasagne.layers.DenseLayer(
        decoder, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)
    # Fix: honor `load_previous` — restore checkpointed weights before
    # training. The flag used to be printed but never used.
    if load_previous:
        print('loading previous saved model from {}'.format(save_filename))
        with np.load(save_filename) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(l_softmax, param_values)
    target_var = T.ivector('target_var')
    # lasagne.layers.get_output produces a variable for the output of the net
    prediction = lasagne.layers.get_output(l_softmax, deterministic=False)
    # The network output will have shape (n_batch, 3);
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    cost = loss.mean()
    if l2_weight > 0.:
        # apply l2 regularization; the penalty covers all network params
        # (a never-used per-layer dict was removed here).
        print('apply l2 penalty to all layers, weight: {}'.format(l2_weight))
        l2_penalty = lasagne.regularization.regularize_network_params(
            l_softmax, lasagne.regularization.l2) * l2_weight
        cost += l2_penalty
    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_softmax, trainable=True)
    # Compute adam updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adam(cost, all_params,
                                   learning_rate=learning_rate)
    # Deterministic pass (dropout disabled) for evaluation.
    test_prediction = lasagne.layers.get_output(l_softmax, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train_fn = theano.function(
        [premise_var, premise_mask, hypo_var, hypo_mask, target_var],
        cost, updates=updates)
    val_fn = theano.function(
        [premise_var, premise_mask, hypo_var, hypo_mask, target_var],
        [test_loss, test_acc])
    print("Training ...")
    print('train_data.shape: {0}'.format(train_data.shape))
    print('val_data.shape: {0}'.format(val_data.shape))
    print('test_data.shape: {0}'.format(test_data.shape))
    try:
        # Finally, launch the training loop.
        print("Training started...")
        # iterate over epochs:
        for epoch in range(num_epochs):
            # In each epoch, do a full pass over the training data:
            shuffled_train_data = train_data.reindex(
                np.random.permutation(train_data.index))
            train_err = 0
            train_acc = 0
            train_batches = 0
            start_time = time.time()
            display_at = time.time()
            save_at = time.time()
            for start_i in range(0, len(shuffled_train_data), batch_size):
                batched_data = shuffled_train_data[start_i:start_i + batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                train_err += train_fn(ps, p_masks, hs, h_masks, labels)
                # Second (deterministic) pass only for the running accuracy;
                # its loss is discarded.
                _, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                train_acc += acc
                train_batches += 1
                # display
                if train_batches % display_freq == 0:
                    print("Seen {:d} samples, time used: {:.3f}s".format(
                        start_i + batch_size, time.time() - display_at))
                    print("  current training loss:\t\t{:.6f}".format(
                        train_err / train_batches))
                    print("  current training accuracy:\t\t{:.6f}".format(
                        train_acc / train_batches))
                    # Fix: reset so the next report times only the interval
                    # since this one (mirrors how save_at is handled below).
                    display_at = time.time()
                # do tmp save model
                if train_batches % save_freq == 0:
                    print('saving to ..., time used {:.3f}s'.format(
                        time.time() - save_at))
                    np.savez(save_filename,
                             *lasagne.layers.get_all_param_values(l_softmax))
                    save_at = time.time()
            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            for start_i in range(0, len(val_data), batch_size):
                batched_data = val_data[start_i:start_i + batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                err, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                val_err += err
                val_acc += acc
                val_batches += 1
            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
            print("  training accuracy:\t\t{:.2f} %".format(
                train_acc / train_batches * 100))
            print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
            print("  validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))
            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for start_i in range(0, len(test_data), batch_size):
                batched_data = test_data[start_i:start_i + batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                err, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                test_err += err
                test_acc += acc
                test_batches += 1
            # print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
            print("  test accuracy:\t\t{:.2f} %".format(
                test_acc / test_batches * 100))
            # Per-epoch checkpoint alongside the rolling `save_filename`.
            filename = './snli/{}_model_epoch{}.npz'.format(mode, epoch + 1)
            print('saving to {}'.format(filename))
            np.savez(filename,
                     *lasagne.layers.get_all_param_values(l_softmax))
    except KeyboardInterrupt:
        print('exit ...')