# Consolidated imports for the model classes below. The layer and IO helpers
# (HiddenLayer, DropoutHiddenLayer, _dropout_from_layer, LSTMLayer,
# AttendLSTMLayer, RnnLayer, ConvLayer, LogisticRegression, OutputLayer, dA,
# dA_maxout, _nnet2file, nnet) are PDNN-internal; their exact import paths
# depend on the PDNN checkout, so they are not spelled out here.
import collections
import sys

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams


class DNN(object):

    def __init__(self, numpy_rng, theano_rng=None,
                 cfg=None,  # the network configuration
                 dnn_shared=None, shared_layers=[], input=None):

        self.layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation
        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size
        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg
        self.non_updated_layers = cfg.non_updated_layers

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input
        self.y = T.ivector('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output
            W = None
            b = None
            if i in shared_layers:
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            if self.do_maxout:
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                           W=W, b=b,
                                           activation=(lambda x: 1.0 * x),
                                           do_maxout=True,
                                           pool_size=self.pool_size)
            else:
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i],
                                           W=W, b=b,
                                           activation=self.activation)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            # if the layer index is included in self.non_updated_layers,
            # parameters of this layer will not be updated
            if i not in self.non_updated_layers:
                self.params.extend(hidden_layer.params)
                self.delta_params.extend(hidden_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=self.hidden_layers_sizes[-1],
                                           n_out=self.n_outs)
        if self.n_outs > 0:
            self.layers.append(self.logLayer)
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        # compute the cost for the second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())
        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn

    def build_extract_feat_function(self, output_layer):
        feat = T.matrix('feat')
        out_da = theano.function([feat], self.layers[output_layer].output,
                                 updates=None, givens={self.x: feat},
                                 on_unused_input='warn')
        return out_da

    def build_finetune_functions_kaldi(self, train_shared_xy, valid_shared_xy):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={self.x: train_set_x, self.y: train_set_y})

        valid_fn = theano.function(
            inputs=[],
            outputs=self.errors,
            givens={self.x: valid_set_x, self.y: valid_set_y})

        return train_fn, valid_fn

    def write_model_to_raw(self, file_path):
        # output the model to tmp_path; this format is readable by PDNN
        _nnet2file(self.layers, filename=file_path)

    def write_model_to_kaldi(self, file_path, with_softmax=True):
        # determine whether it's BNF based on layer sizes
        output_layer_number = -1
        for layer_index in range(1, self.hidden_layers_number - 1):
            cur_layer_size = self.hidden_layers_sizes[layer_index]
            prev_layer_size = self.hidden_layers_sizes[layer_index - 1]
            next_layer_size = self.hidden_layers_sizes[layer_index + 1]
            if cur_layer_size < prev_layer_size and cur_layer_size < next_layer_size:
                output_layer_number = layer_index + 1
                break

        layer_number = len(self.layers)
        if output_layer_number == -1:
            output_layer_number = layer_number

        fout = open(file_path, 'wb')
        for i in xrange(output_layer_number):
            activation_text = '<' + self.cfg.activation_text + '>'
            # we assume that the last layer is a softmax layer
            if i == (layer_number - 1) and with_softmax:
                activation_text = '<softmax>'
            W_mat = self.layers[i].W.get_value()
            b_vec = self.layers[i].b.get_value()
            input_size, output_size = W_mat.shape
            W_layer = []
            b_layer = ''
            for rowX in xrange(output_size):
                W_layer.append('')
            for x in xrange(input_size):
                for t in xrange(output_size):
                    W_layer[t] = W_layer[t] + str(W_mat[x][t]) + ' '
            for x in xrange(output_size):
                b_layer = b_layer + str(b_vec[x]) + ' '
            fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
            fout.write('[' + '\n')
            for x in xrange(output_size):
                fout.write(W_layer[x].strip() + '\n')
            fout.write(']' + '\n')
            fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
            if activation_text == '<maxout>':
                fout.write(activation_text + ' ' + str(output_size / self.pool_size) + ' ' + str(output_size) + '\n')
            else:
                fout.write(activation_text + ' ' + str(output_size) + ' ' + str(output_size) + '\n')
        fout.close()
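
# Illustrative sketch (not part of PDNN): what the max_col_norm constraint in
# build_finetune_functions above computes, replayed in plain numpy. Column
# norms of the updated weight matrix are clipped to max_col_norm and the
# matrix is rescaled column-wise; the 1e-7 guards against division by zero.
def _demo_max_col_norm(W, max_col_norm=1.0):
    col_norms = numpy.sqrt(numpy.sum(numpy.square(W), axis=0))
    desired_norms = numpy.clip(col_norms, 0, max_col_norm)
    # every column of the result has norm <= max_col_norm
    return W * (desired_norms / (1e-7 + col_norms))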
def __init__(self, numpy_rng, theano_rng=None,
             cfg=None,  # the network configuration
             dnn_shared=None, shared_layers=[], input=None):

    self.cfg = cfg
    self.params = []
    self.delta_params = []

    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    self.max_col_norm = cfg.max_col_norm
    print self.max_col_norm

    self.layers = []
    self.lstm_layers = []
    self.fc_layers = []

    # 1. lstm
    self.lstm_layers_sizes = cfg.lstm_layers_sizes
    self.lstm_layers_number = len(self.lstm_layers_sizes)
    # 2. dnn
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    if input is None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.ivector('y')

    #######################
    #  build lstm layers  #
    #######################
    print '1. start to build lstm layer: ' + str(self.lstm_layers_number)
    for i in xrange(self.lstm_layers_number):
        if i == 0:
            input_size = self.n_ins
            input = self.x
        else:
            input_size = self.lstm_layers_sizes[i - 1]
            input = self.layers[-1].output
        lstm_layer = LSTMLayer(rng=numpy_rng, input=input, n_in=input_size,
                               n_out=self.lstm_layers_sizes[i])
        print '\tbuild lstm layer: ' + str(input_size) + ' x ' + str(lstm_layer.n_out)
        self.layers.append(lstm_layer)
        self.lstm_layers.append(lstm_layer)
        self.params.extend(lstm_layer.params)
        self.delta_params.extend(lstm_layer.delta_params)
    print '1. finish lstm layer: ' + str(self.layers[-1].n_out)

    #######################
    #  build dnnv layers  #
    #######################
    # (the fully-connected layers between the LSTM stack and the softmax are
    #  currently disabled)
    # print '2. start to build dnnv layer: ' + str(self.hidden_layers_number)
    # for i in xrange(self.hidden_layers_number):
    #     if i == 0:
    #         input_size = self.layers[-1].n_out
    #     else:
    #         input_size = self.hidden_layers_sizes[i - 1]
    #     input = self.layers[-1].output
    #     fc_layer = HiddenLayer(rng=numpy_rng, input=input, n_in=input_size,
    #                            n_out=self.hidden_layers_sizes[i],
    #                            activation=self.activation)
    #     print '\tbuild dnnv layer: ' + str(input_size) + ' x ' + str(fc_layer.n_out)
    #     self.layers.append(fc_layer)
    #     self.fc_layers.append(fc_layer)
    #     self.params.extend(fc_layer.params)
    #     self.delta_params.extend(fc_layer.delta_params)
    # print '2. finish dnnv layer: ' + str(self.layers[-1].n_out)

    #######################
    #  build log layers   #
    #######################
    print '3. start to build log layer: 1'
    input_size = self.layers[-1].n_out
    input = self.layers[-1].output
    logLayer = LogisticRegression(input=input, n_in=input_size, n_out=self.n_outs)
    print '\tbuild final layer: ' + str(input_size) + ' x ' + str(self.n_outs)
    self.layers.append(logLayer)
    self.params.extend(logLayer.params)
    self.delta_params.extend(logLayer.delta_params)
    print '3. finish log layer: ' + str(self.layers[-1].n_out)

    print 'Total layers: ' + str(len(self.layers))
    sys.stdout.flush()

    self.finetune_cost = logLayer.negative_log_likelihood(self.y)
    self.errors = logLayer.errors(self.y)
class CNN_SAT(object):

    def __init__(self, numpy_rng, theano_rng=None, batch_size=256, n_outs=500,
                 conv_layer_configs=[], hidden_layers_sizes=[500, 500],
                 ivec_layers_sizes=[500, 500],
                 conv_activation=T.nnet.sigmoid, full_activation=T.nnet.sigmoid,
                 use_fast=False, update_part=[0, 1], ivec_dim=100):

        self.conv_layers = []
        self.full_layers = []
        self.ivec_layers = []
        self.params = []
        self.delta_params = []

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        input_shape = conv_layer_configs[0]['input_shape']
        n_ins = input_shape[-1] * input_shape[-2] * input_shape[-3]
        self.iv = self.x[:, n_ins:n_ins + ivec_dim]
        self.raw = self.x[:, 0:n_ins]

        self.conv_layer_num = len(conv_layer_configs)
        self.full_layer_num = len(hidden_layers_sizes)
        self.ivec_layer_num = len(ivec_layers_sizes)

        # construct the adaptation NN
        for i in xrange(self.ivec_layer_num):
            if i == 0:
                input_size = ivec_dim
                layer_input = self.iv
            else:
                input_size = ivec_layers_sizes[i - 1]
                layer_input = self.ivec_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=ivec_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.ivec_layers.append(sigmoid_layer)
            if 0 in update_part:
                self.params.extend(sigmoid_layer.params)
                self.delta_params.extend(sigmoid_layer.delta_params)

        linear_func = lambda x: x
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=self.ivec_layers[-1].output,
                                    n_in=ivec_layers_sizes[-1], n_out=n_ins,
                                    activation=linear_func)
        self.ivec_layers.append(sigmoid_layer)
        if 0 in update_part:
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        for i in xrange(self.conv_layer_num):
            if i == 0:
                input = self.raw + self.ivec_layers[-1].output
            else:
                input = self.conv_layers[-1].output
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation,
                                   flatten=config['flatten'],
                                   use_fast=use_fast)
            self.conv_layers.append(conv_layer)
            if 1 in update_part:
                self.params.extend(conv_layer.params)
                self.delta_params.extend(conv_layer.delta_params)

        self.conv_output_dim = (config['output_shape'][1] *
                                config['output_shape'][2] *
                                config['output_shape'][3])

        for i in xrange(self.full_layer_num):
            # construct the sigmoidal layer
            if i == 0:
                input_size = self.conv_output_dim
                layer_input = self.conv_layers[-1].output
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.full_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=full_activation)
            # add the layer to our list of layers
            self.full_layers.append(sigmoid_layer)
            if 1 in update_part:
                self.params.extend(sigmoid_layer.params)
                self.delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.full_layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.full_layers.append(self.logLayer)
        if 1 in update_part:
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def kl_divergence(self, p, p_hat):
        return p * T.log(p / p_hat) + (1 - p) * T.log((1 - p) / (1 - p_hat))

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
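
# Illustrative sketch (not part of PDNN): the momentum update rule that the
# OrderedDict of Theano updates above encodes, written out in numpy. Each
# delta accumulates a decaying history of gradients, and the parameter moves
# by the delta.
def _demo_momentum_step(param, delta, grad, learning_rate=0.0001, momentum=0.5):
    delta = momentum * delta - learning_rate * grad  # updates[dparam]
    param = param + delta                            # updates[param]
    return param, delta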
def __init__(self, numpy_rng, theano_rng=None, batch_size=256, n_outs=500,
             sparsity=None, sparsity_weight=None, sparse_layer=3,
             conv_layer_configs=[], hidden_layers_sizes=[500, 500],
             conv_activation=T.nnet.sigmoid, full_activation=T.nnet.sigmoid,
             use_fast=False):

    self.layers = []
    self.params = []
    self.delta_params = []

    self.sparsity = sparsity
    self.sparsity_weight = sparsity_weight
    self.sparse_layer = sparse_layer

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    self.conv_layer_num = len(conv_layer_configs)
    self.full_layer_num = len(hidden_layers_sizes)

    for i in xrange(self.conv_layer_num):
        if i == 0:
            input = self.x
            is_input_layer = True
        else:
            input = self.layers[-1].output
            is_input_layer = False
        config = conv_layer_configs[i]
        conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                               is_input_layer=is_input_layer,
                               input_shape=config['input_shape'],
                               filter_shape=config['filter_shape'],
                               poolsize=config['poolsize'],
                               activation=conv_activation,
                               flatten=config['flatten'])
        self.layers.append(conv_layer)
        self.params.extend(conv_layer.params)
        self.delta_params.extend(conv_layer.delta_params)

    self.conv_output_dim = (config['output_shape'][1] *
                            config['output_shape'][2] *
                            config['output_shape'][3])

    for i in xrange(self.full_layer_num):
        # construct the sigmoidal layer
        if i == 0:
            input_size = self.conv_output_dim
        else:
            input_size = hidden_layers_sizes[i - 1]
        layer_input = self.layers[-1].output
        sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=full_activation)
        # add the layer to our list of layers
        self.layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.layers.append(self.logLayer)
    self.params.extend(self.logLayer.params)
    self.delta_params.extend(self.logLayer.delta_params)

    if self.sparsity_weight is not None:
        sparsity_level = T.extra_ops.repeat(self.sparsity, 630)
        # note: the original referenced self.sigmoid_layers, which this class
        # never defines; self.layers is the list actually maintained above.
        # kl_divergence is defined on the same class (see CNN_SAT above for
        # the identical definition).
        avg_act = self.layers[sparse_layer].output.mean(axis=0)
        kl_div = self.kl_divergence(sparsity_level, avg_act)
        self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y) +
                              self.sparsity_weight * kl_div.sum())
    else:
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    self.errors = self.logLayer.errors(self.y)
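
# Illustrative sketch (not part of PDNN): the sparsity penalty used above.
# kl_divergence(p, p_hat) is the KL divergence between two Bernoulli
# distributions; summed over hidden units, it penalizes mean activations
# p_hat that drift away from the target sparsity p.
def _demo_sparsity_penalty(avg_act, sparsity=0.05, sparsity_weight=0.1):
    p, p_hat = sparsity, avg_act
    kl = p * numpy.log(p / p_hat) + (1 - p) * numpy.log((1 - p) / (1 - p_hat))
    return sparsity_weight * kl.sum()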
def __init__(self, numpy_rng, theano_rng=None,
             cfg=None,  # the network configuration
             dnn_shared=None, shared_layers=[], input=None, draw=None):

    self.cfg = cfg
    self.params = []
    self.delta_params = []

    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    # hard-coded here (the other networks read it from cfg)
    self.max_col_norm = 1
    print self.max_col_norm

    self.layers = []
    self.lstm_layers = []
    self.fc_layers = []

    # 1. lstm
    self.lstm_layers_sizes = cfg.lstm_layers_sizes
    self.lstm_layers_number = len(self.lstm_layers_sizes)
    # 2. dnn
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    if input is None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.ivector('y')

    #######################
    #  build lstm layers  #
    #######################
    print '1. start to build AttendLSTMLayer : ' + str(self.lstm_layers_number) + ', n_attendout: ' + str(cfg.batch_size)
    for i in xrange(1):
        if i == 0:
            input_size = self.n_ins
            input = self.x
        else:
            input_size = self.lstm_layers_sizes[i - 1]
            input = self.layers[-1].output
        lstm_layer = AttendLSTMLayer(rng=numpy_rng, input=input,
                                     n_in=input_size,
                                     n_out=self.lstm_layers_sizes[i],
                                     steps=cfg.batch_size, draw=draw)
        print '\tbuild AttendLSTMLayer: ' + str(input_size) + ' x ' + str(lstm_layer.n_out)
        self.layers.append(lstm_layer)
        self.lstm_layers.append(lstm_layer)
        self.params.extend(lstm_layer.params)
        self.delta_params.extend(lstm_layer.delta_params)
    print '1. finish AttendLSTMLayer: ' + str(self.layers[-1].n_out)

    print '2. start to build LSTMLayer : ' + str(self.lstm_layers_number)
    for i in xrange(1, self.lstm_layers_number, 1):
        if i == 0:
            input_size = self.n_ins
            input = self.x
        else:
            input_size = self.lstm_layers_sizes[i - 1]
            input = self.layers[-1].output
        lstm_layer = LSTMLayer(rng=numpy_rng, input=input, n_in=input_size,
                               n_out=self.lstm_layers_sizes[i])
        print '\tbuild LSTMLayer: ' + str(input_size) + ' x ' + str(lstm_layer.n_out)
        self.layers.append(lstm_layer)
        self.lstm_layers.append(lstm_layer)
        self.params.extend(lstm_layer.params)
        self.delta_params.extend(lstm_layer.delta_params)
    print '2. finish LSTMLayer: ' + str(self.layers[-1].n_out)

    #######################
    #  build dnnv layers  #
    #######################
    # (the fully-connected layers are currently disabled)
    # print '2. start to build dnnv layer: ' + str(self.hidden_layers_number)
    # for i in xrange(self.hidden_layers_number):
    #     if i == 0:
    #         input_size = self.layers[-1].n_out
    #     else:
    #         input_size = self.hidden_layers_sizes[i - 1]
    #     input = self.layers[-1].output
    #     fc_layer = HiddenLayer(rng=numpy_rng, input=input, n_in=input_size,
    #                            n_out=self.hidden_layers_sizes[i],
    #                            activation=self.activation)
    #     print '\tbuild dnnv layer: ' + str(input_size) + ' x ' + str(fc_layer.n_out)
    #     self.layers.append(fc_layer)
    #     self.fc_layers.append(fc_layer)
    #     self.params.extend(fc_layer.params)
    #     self.delta_params.extend(fc_layer.delta_params)
    # print '2. finish dnnv layer: ' + str(self.layers[-1].n_out)

    #######################
    #  build log layers   #
    #######################
    print '3. start to build log layer: 1'
    input_size = self.layers[-1].n_out
    input = self.layers[-1].output
    logLayer = LogisticRegression(input=input, n_in=input_size, n_out=self.n_outs)
    print '\tbuild final layer: ' + str(input_size) + ' x ' + str(self.n_outs)
    self.layers.append(logLayer)
    self.params.extend(logLayer.params)
    self.delta_params.extend(logLayer.delta_params)
    print '3. finish log layer: ' + str(self.layers[-1].n_out)

    print 'Total layers: ' + str(len(self.layers))
    sys.stdout.flush()

    self.finetune_cost = logLayer.negative_log_likelihood(self.y)
    self.errors = logLayer.errors(self.y)
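
# Illustrative sketch (not part of PDNN): the finetuning cost used above,
# assuming LogisticRegression.negative_log_likelihood follows the usual
# Theano-tutorial definition -- the mean over the minibatch of -log p(y | x).
def _demo_negative_log_likelihood(p_y_given_x, y):
    # p_y_given_x: (n_examples, n_classes) softmax outputs; y: integer labels
    return -numpy.mean(numpy.log(p_y_given_x[numpy.arange(len(y)), y]))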
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             corruption_levels=[0.1, 0.1], pool_size=3,
             sparsity=None, sparsity_weight=None,
             first_reconstruct_activation=T.tanh):

    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer
        # the size of the input is either the number of hidden units of
        # the layer below or the input size if we are on the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]
        # the input to this layer is either the activation of the hidden
        # layer below or the input of the SdA if you are on the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i] * pool_size,
                                    activation=(lambda x: 1.0 * x),
                                    do_maxout=True, pool_size=pool_size)
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # construct a denoising autoencoder that shares weights with this layer
        if i == 0:
            reconstruct_activation = first_reconstruct_activation
        else:
            reconstruct_activation = (lambda x: 1.0 * x)
            # reconstruct_activation = first_reconstruct_activation
        dA_layer = dA_maxout(numpy_rng=numpy_rng, theano_rng=theano_rng,
                             input=layer_input, n_visible=input_size,
                             n_hidden=hidden_layers_sizes[i] * pool_size,
                             W=sigmoid_layer.W, bhid=sigmoid_layer.b,
                             sparsity=sparsity,
                             sparsity_weight=sparsity_weight,
                             pool_size=pool_size,
                             reconstruct_activation=reconstruct_activation)
        self.dA_layers.append(dA_layer)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.sigmoid_layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.sigmoid_layers.append(self.logLayer)
    self.params.extend(self.logLayer.params)
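
# Illustrative sketch (not part of PDNN): maxout pooling as the layer sizing
# above implies -- each hidden layer produces hidden_layers_sizes[i] *
# pool_size linear outputs, and (assuming HiddenLayer's do_maxout groups each
# pool_size consecutive units) the max of every group is kept.
def _demo_maxout(linear_out, pool_size=3):
    n, d = linear_out.shape  # d must be a multiple of pool_size
    return linear_out.reshape(n, d // pool_size, pool_size).max(axis=2)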
class DNN_SAT(object):

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 activation=T.nnet.sigmoid,
                 do_maxout=False, pool_size=1,
                 do_pnorm=False, pnorm_order=1,
                 max_col_norm=None, l1_reg=None, l2_reg=None,
                 ivec_layers_sizes=[500, 500], ivec_dim=100):

        self.sigmoid_layers = []
        self.ivec_layers = []
        # params and delta_params for the DNN parameters; the sigmoid prefix
        # is a bit confusing
        self.sigmoid_params = []
        self.sigmoid_delta_params = []
        # params and delta_params for the iVecNN parameters
        self.ivec_params = []
        self.ivec_delta_params = []
        # the params to be updated in the current training
        self.params = []
        self.delta_params = []

        self.n_layers = len(hidden_layers_sizes)
        self.ivec_layer_num = len(ivec_layers_sizes)

        self.max_col_norm = max_col_norm
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        # we assume that i-vectors are appended to speech features in a
        # frame-wise manner
        self.iv = self.x[:, n_ins:n_ins + ivec_dim]
        self.raw = self.x[:, 0:n_ins]

        # construct the iVecNN which generates linear feature shifts
        for i in xrange(self.ivec_layer_num):
            if i == 0:
                input_size = ivec_dim
                layer_input = self.iv
            else:
                input_size = ivec_layers_sizes[i - 1]
                layer_input = self.ivec_layers[-1].output
            ivec_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                     n_in=input_size,
                                     n_out=ivec_layers_sizes[i],
                                     activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.ivec_layers.append(ivec_layer)
            self.ivec_params.extend(ivec_layer.params)
            self.ivec_delta_params.extend(ivec_layer.delta_params)

        # the final output layer which has the same dimension as the input
        # features
        linear_func = lambda x: x
        ivec_layer = HiddenLayer(rng=numpy_rng,
                                 input=self.ivec_layers[-1].output,
                                 n_in=ivec_layers_sizes[-1], n_out=n_ins,
                                 activation=linear_func)
        self.ivec_layers.append(ivec_layer)
        self.ivec_params.extend(ivec_layer.params)
        self.ivec_delta_params.extend(ivec_layer.delta_params)

        # construct the DNN (canonical model)
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
                layer_input = self.raw + self.ivec_layers[-1].output
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output
            if do_maxout:
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i] * pool_size,
                                            activation=(lambda x: 1.0 * x),
                                            do_maxout=True, pool_size=pool_size)
            elif do_pnorm:
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i] * pool_size,
                                            activation=(lambda x: 1.0 * x),
                                            do_pnorm=True, pool_size=pool_size,
                                            pnorm_order=pnorm_order)
            else:
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i],
                                            activation=activation)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.sigmoid_params.extend(sigmoid_layer.params)
            self.sigmoid_delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.sigmoid_layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.sigmoid_layers.append(self.logLayer)
        self.sigmoid_params.extend(self.logLayer.params)
        self.sigmoid_delta_params.extend(self.logLayer.delta_params)

        # construct a function that implements one step of finetuning;
        # compute the cost for the second phase of training, defined as the
        # negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates (an OrderedDict keeps the
        # update order deterministic; the original used a plain dict)
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
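
# Illustrative sketch (not part of PDNN): one plausible way to drive the
# separate parameter lists above for speaker-adaptive training -- point
# self.params / self.delta_params at either the iVecNN or the canonical DNN
# before compiling the finetuning functions, so only that part is updated in
# the current pass.
def _demo_sat_alternation(net, train_xy, valid_xy, batch_size, adapt_ivec):
    if adapt_ivec:
        net.params, net.delta_params = net.ivec_params, net.ivec_delta_params
    else:
        net.params, net.delta_params = net.sigmoid_params, net.sigmoid_delta_params
    return net.build_finetune_functions(train_xy, valid_xy, batch_size)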
class SDA(nnet):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of the dA
    at layer `i+1`. The first layer dA gets as input the input of the SdA,
    and the hidden layer of the last dA represents the output. Note that
    after pretraining, the SdA is dealt with as a normal MLP; the dAs are
    only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1], activation=T.nnet.sigmoid):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain at
                                    least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each layer
        """
        # params and delta_params are initialized by the nnet base class
        super(SDA, self).__init__()

        self.layers = []
        self.dA_layers = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as a 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers are
        # shared with different denoising autoencoders. We will first
        # construct the SdA as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct a denoising
        # autoencoder that shares weights with that layer. During pretraining
        # we will train these autoencoders (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the SdA by doing stochastic gradient descent on the MLP.
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer
            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]
            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA; the visible
            # biases in the dA are parameters of those dA, but not the SdA
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

            # construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng,
                          input=layer_input, n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W, bhid=sigmoid_layer.b,
                          activation=T.nnet.sigmoid)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        # compute the cost for the second phase of training, defined as the
        # negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.output = self.logLayer.prediction()
        self.features = self.layers[-2].output
        self.features_dim = self.layers[-2].n_out

    def pretraining_functions(self, train_x, batch_size):
        '''Generates a list of functions, each of them implementing one step
        in training the dA corresponding to the layer with the same index.
        The function will require as input the minibatch index, and to train
        a dA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_x: theano.tensor.TensorType
        :param train_x: Shared variable that contains all datapoints used
                        for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch
        '''
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # number of batches
        n_batches = train_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA_layer in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA_layer.get_cost_updates(corruption_level,
                                                      learning_rate)
            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(corruption_level, default=0.2),
                        theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
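
# Illustrative sketch (not part of PDNN): layer-wise pretraining with the
# functions returned by pretraining_functions above. `sda` and `train_x` are
# assumed to exist; the epoch count, corruption level, and learning rate are
# placeholder values.
def _demo_pretrain(sda, train_x, batch_size, epochs=5):
    n_batches = train_x.get_value(borrow=True).shape[0] / batch_size
    pretrain_fns = sda.pretraining_functions(train_x, batch_size)
    for layer_index, fn in enumerate(pretrain_fns):
        for epoch in xrange(epochs):
            # 'corruption' and 'lr' are the names of the symbolic inputs
            costs = [fn(i, corruption=0.2, lr=0.1) for i in xrange(n_batches)]
            print 'layer %d, epoch %d, cost %f' % (layer_index, epoch,
                                                   numpy.mean(costs))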
class DNN(object): def __init__(self, numpy_rng, theano_rng=None, cfg = None, # the network configuration dnn_shared = None, shared_layers=[], input = None): self.layers = [] self.params = [] self.delta_params = [] self.rnn_layerX = 2 print "Use DRN" self.cfg = cfg self.n_ins = cfg.n_ins; self.n_outs = cfg.n_outs self.hidden_layers_sizes = cfg.hidden_layers_sizes self.hidden_layers_number = len(self.hidden_layers_sizes) self.activation = cfg.activation self.do_maxout = cfg.do_maxout; self.pool_size = cfg.pool_size self.max_col_norm = cfg.max_col_norm self.l1_reg = cfg.l1_reg self.l2_reg = cfg.l2_reg if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data if input == None: self.x = T.matrix('x') else: self.x = input self.y = T.ivector('y') for i in xrange(self.hidden_layers_number): # construct the hidden layer if i == 0: input_size = self.n_ins layer_input = self.x else: input_size = self.hidden_layers_sizes[i - 1] layer_input = self.layers[-1].output W = None; b = None if (i in shared_layers) : W = dnn_shared.layers[i].W; b = dnn_shared.layers[i].b if i == self.rnn_layerX: hidden_layer = RnnLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], W = W, b = b, activation=self.activation) else: if self.do_maxout == True: hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i] * self.pool_size, W = W, b = b, activation = (lambda x: 1.0*x), do_maxout = True, pool_size = self.pool_size) else: hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], W = W, b = b, activation=self.activation) # add the layer to our list of layers self.layers.append(hidden_layer) self.params.extend(hidden_layer.params) self.delta_params.extend(hidden_layer.delta_params) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs) if self.n_outs > 0: self.layers.append(self.logLayer) self.params.extend(self.logLayer.params) self.delta_params.extend(self.logLayer.delta_params) # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.errors = self.logLayer.errors(self.y) if self.l1_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l1_reg * (abs(W).sum()) if self.l2_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l2_reg * T.sqr(W).sum() def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size): (train_set_x, train_set_y) = train_shared_xy (valid_set_x, valid_set_y) = valid_shared_xy index = T.lscalar('index') # index to a [mini]batch learning_rate = T.fscalar('learning_rate') momentum = T.fscalar('momentum') # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = collections.OrderedDict() for dparam, gparam in zip(self.delta_params, gparams): updates[dparam] = momentum * dparam - gparam*learning_rate for dparam, param in zip(self.delta_params, self.params): updates[param] = param + updates[dparam] if self.max_col_norm is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W if W in updates: updated_W = updates[W] col_norms = 
T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn

    def build_extract_feat_function(self, output_layer):
        feat = T.matrix('feat')
        out_da = theano.function([feat], self.layers[output_layer].output,
                                 updates=None, givens={self.x: feat},
                                 on_unused_input='warn')
        return out_da

    def build_finetune_functions_kaldi(self, train_shared_xy, valid_shared_xy):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={self.x: train_set_x, self.y: train_set_y})

        valid_fn = theano.function(
            inputs=[],
            outputs=self.errors,
            givens={self.x: valid_set_x, self.y: valid_set_y})

        return train_fn, valid_fn

    def write_model_to_raw(self, file_path):
        # output the model to file_path; this format is readable by PDNN
        _nnet2file(self.layers, filename=file_path)

    def write_model_to_kaldi(self, file_path, with_softmax=True):
        # determine whether it's BNF based on layer sizes
        output_layer_number = -1
        for layer_index in range(1, self.hidden_layers_number - 1):
            cur_layer_size = self.hidden_layers_sizes[layer_index]
            prev_layer_size = self.hidden_layers_sizes[layer_index - 1]
            next_layer_size = self.hidden_layers_sizes[layer_index + 1]
            if cur_layer_size < prev_layer_size and cur_layer_size < next_layer_size:
                output_layer_number = layer_index + 1
                break

        layer_number = len(self.layers)
        if output_layer_number == -1:
            output_layer_number = layer_number

        fout = open(file_path, 'wb')
        for i in xrange(output_layer_number):
            activation_text = '<' + self.cfg.activation_text + '>'
            if i == (layer_number - 1) and with_softmax:
                # we assume that the last layer is a softmax layer
                activation_text = '<softmax>'
            W_mat = self.layers[i].W.get_value()
            b_vec = self.layers[i].b.get_value()
            input_size, output_size = W_mat.shape
            W_layer = []
            b_layer = ''
            for rowX in xrange(output_size):
                W_layer.append('')
            for x in xrange(input_size):
                for t in xrange(output_size):
                    W_layer[t] = W_layer[t] + str(W_mat[x][t]) + ' '
            for x in xrange(output_size):
                b_layer = b_layer + str(b_vec[x]) + ' '
            fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
            fout.write('[' + '\n')
            for x in xrange(output_size):
                fout.write(W_layer[x].strip() + '\n')
            fout.write(']' + '\n')
            fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
            if activation_text == '<maxout>':
                fout.write(activation_text + ' ' + str(output_size / self.pool_size) + ' ' + str(output_size) + '\n')
            else:
                fout.write(activation_text + ' ' + str(output_size) + ' ' + str(output_size) + '\n')
        fout.close()
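# ----------------------------------------------------------------------
# Illustration (not part of the original module): a minimal NumPy sketch of
# the Kaldi nnet1 text block that write_model_to_kaldi() above emits for one
# layer. `_demo_kaldi_affine_text` is a hypothetical helper; it assumes W is
# stored input x output and transposed into Kaldi's output x input layout.
# ----------------------------------------------------------------------
def _demo_kaldi_affine_text():
    import numpy
    W = numpy.arange(6, dtype='float32').reshape(2, 3)  # 2 inputs x 3 outputs
    b = numpy.zeros(3, dtype='float32')
    lines = ['<affinetransform> %d %d' % (W.shape[1], W.shape[0]), '[']
    for t in xrange(W.shape[1]):  # one text row per output unit
        lines.append(' '.join(str(W[x][t]) for x in xrange(W.shape[0])))
    lines.append(']')
    lines.append('[ ' + ' '.join(str(v) for v in b) + ' ]')
    lines.append('<sigmoid> %d %d' % (W.shape[1], W.shape[1]))
    return '\n'.join(lines)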
    def __init__(self, task_id, numpy_rng, theano_rng=None,
                 cfg=None,  # the network configuration
                 dnn_shared=None, shared_layers=[], input=None):
        self.layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation
        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size
        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg
        self.non_updated_layers = cfg.non_updated_layers

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input
        # task 0 is classification (integer targets); other tasks are regression
        if task_id == 0:
            self.y = T.ivector('y')
        else:
            self.y = T.matrix('y')

        #######################
        #  build dnn layers   #
        #######################
        print "=============="
        print "Task ID: %d" % (task_id)
        print "=============="
        print '1. start to build dnn layer: ' + str(self.hidden_layers_number)
        for i in xrange(self.hidden_layers_number):
            if i == 0:
                input_size = self.n_ins
                input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                input = self.layers[-1].output
            W = None
            b = None
            if i in shared_layers:
                print "shared layer = %d" % (i)
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            hidden_layer = HiddenLayer(rng=numpy_rng, input=input,
                                       n_in=input_size,
                                       n_out=self.hidden_layers_sizes[i],
                                       W=W, b=b, activation=self.activation)
            print '\tbuild dnn layer: ' + str(input_size) + ' x ' + str(hidden_layer.n_out)
            self.layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)
            self.delta_params.extend(hidden_layer.delta_params)
        print '1. finish dnn layer: ' + str(self.layers[-1].n_out)

        #######################
        #  build log layers   #
        #######################
        print '2. start to build final layer: 1'
        input_size = self.layers[-1].n_out
        input = self.layers[-1].output
        if task_id == 0:
            self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                               n_in=self.hidden_layers_sizes[-1],
                                               n_out=self.n_outs)
            print '\tbuild final layer (classification): ' + str(input_size) + ' x ' + str(self.logLayer.n_out)
            self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
            self.errors = self.logLayer.errors(self.y)
        else:
            self.logLayer = OutputLayer(input=input, n_in=input_size, n_out=self.n_outs)
            print '\tbuild final layer (regression): ' + str(input_size) + ' x ' + str(self.logLayer.n_out)
            self.finetune_cost = self.logLayer.l2(self.y)
            self.errors = self.logLayer.errors(self.y)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)
        print '2. finish log layer: ' + str(self.layers[-1].n_out)
        print 'Total layers: ' + str(len(self.layers))
        sys.stdout.flush()

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
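# ----------------------------------------------------------------------
# Illustration (not part of the original module): shared_layers/dnn_shared
# above reuses the *same* W and b objects across task networks, so a gradient
# step taken for one task moves the other task's weights too. A hypothetical
# NumPy sketch of that aliasing:
# ----------------------------------------------------------------------
def _demo_shared_layer_aliasing():
    import numpy
    W = numpy.zeros((4, 3))                # one hidden layer's weight matrix
    task0_params = [W]                     # classification net (task_id == 0)
    task1_params = [W]                     # regression net shares layer 0
    task0_params[0] += 1.0                 # an in-place update while training task 0
    assert (task1_params[0] == 1.0).all()  # ...is immediately visible to task 1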
class DNN_Dropout(object):

    def __init__(self, numpy_rng, theano_rng=None, cfg=None,
                 dnn_shared=None, shared_layers=[]):
        self.layers = []
        self.dropout_layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation
        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size
        self.input_dropout_factor = cfg.input_dropout_factor
        self.dropout_factor = cfg.dropout_factor
        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in range(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
                if self.input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.x, self.input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output
            W = None
            b = None
            if i in shared_layers:
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            if self.do_maxout == False:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng, input=dropout_layer_input,
                                                   n_in=input_size,
                                                   n_out=self.hidden_layers_sizes[i],
                                                   W=W, b=b, activation=self.activation,
                                                   dropout_factor=self.dropout_factor[i])
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i],
                                           activation=self.activation,
                                           W=dropout_layer.W, b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng, input=dropout_layer_input,
                                                   n_in=input_size,
                                                   n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                                   W=W, b=b, activation=(lambda x: 1.0 * x),
                                                   dropout_factor=self.dropout_factor[i],
                                                   do_maxout=True, pool_size=self.pool_size)
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                           activation=(lambda x: 1.0 * x),
                                           W=dropout_layer.W, b=dropout_layer.b,
                                           do_maxout=True, pool_size=self.pool_size)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(
            input=self.dropout_layers[-1].dropout_output,
            n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)
        self.logLayer = LogisticRegression(
            input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
            n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs,
            W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)
        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in range(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())
        if self.l2_reg is not None:
            for i in range(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch

        # the learning-rate scheduler supplies the optimizer updates
        updates = self.cfg.lrate.getOptimizerUpdates(self.finetune_cost, self.delta_params, self.params)
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in range(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn

    def write_model_to_raw(self, file_path):
        # output the model to file_path; this format is readable by PDNN
        _nnet2file(self.layers, filename=file_path,
                   input_factor=self.input_dropout_factor, factor=self.dropout_factor)

    def write_model_to_kaldi(self, file_path, with_softmax=True):
        # determine whether it's BNF based on layer sizes
        output_layer_number = -1
        for layer_index in range(1, self.hidden_layers_number - 1):
            cur_layer_size = self.hidden_layers_sizes[layer_index]
            prev_layer_size = self.hidden_layers_sizes[layer_index - 1]
            next_layer_size = self.hidden_layers_sizes[layer_index + 1]
            if cur_layer_size < prev_layer_size and cur_layer_size < next_layer_size:
                output_layer_number = layer_index + 1
                break

        layer_number = len(self.layers)
        if output_layer_number == -1:
            output_layer_number = layer_number

        fout = smart_open(file_path, 'wb')
        for i in range(output_layer_number):
            # decide the dropout factor for this layer
            dropout_factor = 0.0
            if i == 0:
                dropout_factor = self.input_dropout_factor
            if i > 0 and len(self.dropout_factor) > 0:
                dropout_factor = self.dropout_factor[i - 1]

            activation_text = '<' + self.cfg.activation_text + '>'
            if i == (layer_number - 1) and with_softmax:
                # we assume that the last layer is a softmax layer
                activation_text = '<softmax>'
            # scale the weights by (1 - dropout) so decoding needs no extra scaling
            W_mat = (1.0 - dropout_factor) * self.layers[i].W.get_value()
            b_vec = self.layers[i].b.get_value()
            input_size, output_size = W_mat.shape
            W_layer = []
            b_layer = ''
            for rowX in range(output_size):
                W_layer.append('')
            for x in range(input_size):
                for t in range(output_size):
                    W_layer[t] = W_layer[t] + str(W_mat[x][t]) + ' '
            for x in range(output_size):
                b_layer = b_layer + str(b_vec[x]) + ' '
            fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
            fout.write('[' + '\n')
            for x in range(output_size):
                fout.write(W_layer[x].strip() + '\n')
            fout.write(']' + '\n')
            fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
            if activation_text == '<maxout>':
                fout.write(activation_text + ' ' + str(output_size / self.pool_size) + ' ' + str(output_size) + '\n')
            else:
                fout.write(activation_text + ' ' + str(output_size) + ' ' + str(output_size) + '\n')
        fout.close()
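# ----------------------------------------------------------------------
# Illustration (not part of the original module): why the clean-pass layers
# and the exported Kaldi weights above are scaled by (1 - dropout_factor).
# With keep probability 1-p, the expected value of a dropped-out activation
# equals the (1-p)-scaled clean activation, as this NumPy sketch checks:
# ----------------------------------------------------------------------
def _demo_dropout_rescaling():
    import numpy
    rng = numpy.random.RandomState(0)
    p = 0.2
    h = numpy.ones(100000)
    mask = rng.binomial(1, 1.0 - p, size=h.shape)  # keep each unit with prob 1-p
    assert abs((mask * h).mean() - (1.0 - p) * h.mean()) < 0.01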
class DNN_Dropout(nnet):

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 activation=T.nnet.sigmoid,
                 input_dropout_factor=0,
                 dropout_factor=[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
                 adv_activation=None, max_col_norm=None,
                 l1_reg=None, l2_reg=None):
        super(DNN_Dropout, self).__init__()
        self.layers = []
        self.dropout_layers = []
        self.n_layers = len(hidden_layers_sizes)

        self.max_col_norm = max_col_norm
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg

        self.input_dropout_factor = input_dropout_factor
        self.dropout_factor = dropout_factor

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer
            if i == 0:
                input_size = n_ins
                layer_input = self.x
                if input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.x, input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output
            if adv_activation is not None:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng, input=dropout_layer_input,
                                                   n_in=input_size,
                                                   n_out=hidden_layers_sizes[i] * adv_activation['pool_size'],
                                                   activation=activation,
                                                   adv_activation_method=adv_activation['method'],
                                                   pool_size=adv_activation['pool_size'],
                                                   pnorm_order=adv_activation['pnorm_order'],
                                                   dropout_factor=self.dropout_factor[i])
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i] * adv_activation['pool_size'],
                                            activation=activation,
                                            adv_activation_method=adv_activation['method'],
                                            pool_size=adv_activation['pool_size'],
                                            pnorm_order=adv_activation['pnorm_order'],
                                            W=dropout_layer.W, b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng, input=dropout_layer_input,
                                                   n_in=input_size, n_out=hidden_layers_sizes[i],
                                                   activation=activation,
                                                   dropout_factor=self.dropout_factor[i])
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size, n_out=hidden_layers_sizes[i],
                                            activation=activation,
                                            W=dropout_layer.W, b=dropout_layer.b)
            # add the layer to our list of layers
            self.layers.append(sigmoid_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(input=self.dropout_layers[-1].dropout_output,
                                                   n_in=hidden_layers_sizes[-1], n_out=n_outs)
        self.logLayer = LogisticRegression(
            input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
            n_in=hidden_layers_sizes[-1], n_out=n_outs,
            W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)
        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        self.output = self.logLayer.prediction()
        self.features = self.layers[-2].output
        self.features_dim = self.layers[-2].n_out

        if self.l1_reg is not None:
            self.__l1Regularization__()
        if self.l2_reg is not None:
            self.__l2Regularization__()

    def save(self, filename, start_layer=0, max_layer_num=-1, withfinal=True):
        nnet_dict = {}
        if max_layer_num == -1:
            max_layer_num = self.n_layers
        for i in range(start_layer, max_layer_num):
            dict_a = str(i) + ' W'
            if i == 0:
                nnet_dict[dict_a] = _array2string((1.0 - self.input_dropout_factor) * self.layers[i].params[0].get_value())
            else:
                nnet_dict[dict_a] = _array2string((1.0 - self.dropout_factor[i - 1]) * self.layers[i].params[0].get_value())
            dict_a = str(i) + ' b'
            nnet_dict[dict_a] = _array2string(self.layers[i].params[1].get_value())
        if withfinal:
            dict_a = 'logreg W'
            nnet_dict[dict_a] = _array2string((1.0 - self.dropout_factor[-1]) * self.logLayer.params[0].get_value())
            dict_a = 'logreg b'
            nnet_dict[dict_a] = _array2string(self.logLayer.params[1].get_value())
        with open(filename, 'wb') as fp:
            json.dump(nnet_dict, fp, indent=2, sort_keys=True)
            fp.flush()

    def load(self, filename, start_layer=0, max_layer_num=-1, withfinal=True):
        nnet_dict = {}
        if max_layer_num == -1:
            max_layer_num = self.n_layers
        with open(filename, 'rb') as fp:
            nnet_dict = json.load(fp)
        for i in xrange(max_layer_num):
            dict_key = str(i) + ' W'
            self.layers[i].params[0].set_value(numpy.asarray(_string2array(nnet_dict[dict_key]), dtype=theano.config.floatX))
            dict_key = str(i) + ' b'
            self.layers[i].params[1].set_value(numpy.asarray(_string2array(nnet_dict[dict_key]), dtype=theano.config.floatX))
        if withfinal:
            dict_key = 'logreg W'
            self.logLayer.params[0].set_value(numpy.asarray(_string2array(nnet_dict[dict_key]), dtype=theano.config.floatX))
            dict_key = 'logreg b'
            self.logLayer.params[1].set_value(numpy.asarray(_string2array(nnet_dict[dict_key]), dtype=theano.config.floatX))
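# ----------------------------------------------------------------------
# Illustration (not part of the original module): the save()/load() pair above
# stores each array as a string inside a JSON dict. A self-contained sketch of
# that round trip; array2string/string2array here are simplified stand-ins for
# PDNN's _array2string/_string2array (the real helpers also handle shape).
# ----------------------------------------------------------------------
def _demo_json_roundtrip():
    import json
    import numpy
    def array2string(a):
        return ' '.join(str(v) for v in numpy.asarray(a).flatten())
    def string2array(s, shape):
        return numpy.array([float(v) for v in s.split()]).reshape(shape)
    W = numpy.eye(2)
    blob = json.dumps({'0 W': array2string(W)})
    W2 = string2array(json.loads(blob)['0 W'], W.shape)
    assert (W == W2).all()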
class DNN_2Tower(object):

    def __init__(self, numpy_rng, theano_rng=None,
                 upper_hidden_layers_sizes=[500, 500], n_outs=10,
                 tower1_hidden_layers_sizes=[500, 500], tower1_n_ins=100,
                 tower2_hidden_layers_sizes=[500, 500], tower2_n_ins=100,
                 activation=T.nnet.sigmoid,
                 do_maxout=False, pool_size=1, do_pnorm=False, pnorm_order=1,
                 max_col_norm=None, l1_reg=None, l2_reg=None):
        self.tower1_layers = []
        self.tower2_layers = []
        self.upper_layers = []
        self.params = []
        self.delta_params = []

        self.max_col_norm = max_col_norm
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        self.tower1_input = self.x[:, 0:tower1_n_ins]
        self.tower2_input = self.x[:, tower1_n_ins:(tower1_n_ins + tower2_n_ins)]

        # build tower1
        for i in xrange(len(tower1_hidden_layers_sizes)):
            if i == 0:
                input_size = tower1_n_ins
                layer_input = self.tower1_input
            else:
                input_size = tower1_hidden_layers_sizes[i - 1]
                layer_input = self.tower1_layers[-1].output
            layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size,
                                n_out=tower1_hidden_layers_sizes[i],
                                activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.tower1_layers.append(layer)
            self.params.extend(layer.params)
            self.delta_params.extend(layer.delta_params)

        # build tower2
        for i in xrange(len(tower2_hidden_layers_sizes)):
            if i == 0:
                input_size = tower2_n_ins
                layer_input = self.tower2_input
            else:
                input_size = tower2_hidden_layers_sizes[i - 1]
                layer_input = self.tower2_layers[-1].output
            layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size,
                                n_out=tower2_hidden_layers_sizes[i],
                                activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.tower2_layers.append(layer)
            self.params.extend(layer.params)
            self.delta_params.extend(layer.delta_params)

        for i in xrange(len(upper_hidden_layers_sizes)):
            # construct the sigmoidal layer
            if i == 0:
                input_size = tower1_hidden_layers_sizes[-1] + tower2_hidden_layers_sizes[-1]
                layer_input = T.concatenate([self.tower1_layers[-1].output,
                                             self.tower2_layers[-1].output], axis=1)
            else:
                input_size = upper_hidden_layers_sizes[i - 1]
                layer_input = self.upper_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=upper_hidden_layers_sizes[i],
                                        activation=activation)
            # add the layer to our list of layers
            self.upper_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.upper_layers[-1].output,
                                           n_in=upper_hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.upper_layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        # construct a function that implements one step of finetunining
        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        # if self.l1_reg is not None:
        #     for i in xrange(self.n_layers):
        #         W = self.params[i * 2]
        #         self.finetune_cost += self.l1_reg * (abs(W).sum())
        # if self.l2_reg is not None:
        #     for i in xrange(self.n_layers):
        #         W = self.params[i * 2]
        #         self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            # W parameters sit at the even indices of self.params
            # (the original iterated over self.n_layers, which this class never sets)
            for i in xrange(len(self.params) // 2):
                W = self.params[i * 2]
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
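# ----------------------------------------------------------------------
# Illustration (not part of the original module): how DNN_2Tower splits one
# input matrix into per-tower slices and later rejoins the tower outputs, as
# a hypothetical NumPy sketch with toy sizes:
# ----------------------------------------------------------------------
def _demo_two_tower_split():
    import numpy
    tower1_n_ins, tower2_n_ins = 3, 2
    x = numpy.arange(10.0).reshape(2, 5)          # batch of 2 frames
    t1 = x[:, 0:tower1_n_ins]                     # mirrors self.tower1_input
    t2 = x[:, tower1_n_ins:tower1_n_ins + tower2_n_ins]  # mirrors self.tower2_input
    joined = numpy.concatenate([t1, t2], axis=1)  # input to the upper layers
    assert joined.shape == (2, tower1_n_ins + tower2_n_ins)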
class CNN(object):

    def __init__(self, numpy_rng, theano_rng=None, batch_size=256, n_outs=500,
                 sparsity=None, sparsity_weight=None, sparse_layer=3,
                 conv_layer_configs=[], hidden_layers_sizes=[500, 500],
                 conv_activation=T.nnet.sigmoid, full_activation=T.nnet.sigmoid,
                 use_fast=False):
        self.layers = []
        self.params = []
        self.delta_params = []

        self.sparsity = sparsity
        self.sparsity_weight = sparsity_weight
        self.sparse_layer = sparse_layer

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        self.conv_layer_num = len(conv_layer_configs)
        self.full_layer_num = len(hidden_layers_sizes)

        for i in xrange(self.conv_layer_num):
            if i == 0:
                input = self.x
                is_input_layer = True
            else:
                input = self.layers[-1].output
                is_input_layer = False
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   is_input_layer=is_input_layer,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation,
                                   flatten=config['flatten'])
            self.layers.append(conv_layer)
            self.params.extend(conv_layer.params)
            self.delta_params.extend(conv_layer.delta_params)

        self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]

        for i in xrange(self.full_layer_num):
            # construct the sigmoidal layer
            if i == 0:
                input_size = self.conv_output_dim
            else:
                input_size = hidden_layers_sizes[i - 1]
            layer_input = self.layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=full_activation)
            # add the layer to our list of layers
            self.layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        if self.sparsity_weight is not None:
            sparsity_level = T.extra_ops.repeat(self.sparsity, 630)
            # was self.sigmoid_layers[...], an attribute this class never defines
            avg_act = self.layers[sparse_layer].output.mean(axis=0)
            kl_div = self.kl_divergence(sparsity_level, avg_act)
            self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + self.sparsity_weight * kl_div.sum()
        else:
            self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        self.errors = self.logLayer.errors(self.y)

    def kl_divergence(self, p, p_hat):
        return p * T.log(p / p_hat) + (1 - p) * T.log((1 - p) / (1 - p_hat))

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates; use an ordered dict as the
        # other classes do, so delta updates precede the parameter updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001),
                    theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
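# ----------------------------------------------------------------------
# Illustration (not part of the original module): the sparsity penalty in
# CNN.kl_divergence() is zero when the observed mean activation p_hat matches
# the target p and grows as they diverge, as this NumPy sketch checks:
# ----------------------------------------------------------------------
def _demo_kl_sparsity():
    import numpy
    p = 0.05                               # target mean activation
    p_hat = numpy.array([0.05, 0.2, 0.5])  # observed mean activations
    kl = p * numpy.log(p / p_hat) + (1 - p) * numpy.log((1 - p) / (1 - p_hat))
    assert kl[0] < 1e-12 and kl[1] < kl[2]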
    def __init__(self, numpy_rng, theano_rng=None,
                 cfg=None,  # the network configuration
                 dnn_shared=None, shared_layers=[], input=None, draw=None):
        self.cfg = cfg
        self.params = []
        self.delta_params = []

        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg
        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size
        self.max_col_norm = 1  # note: hard-coded here, overriding cfg.max_col_norm
        print self.max_col_norm

        self.layers = []
        self.lstm_layers = []
        self.fc_layers = []
        self.bilayers = []

        # 1. lstm
        self.lstm_layers_sizes = cfg.lstm_layers_sizes
        self.lstm_layers_number = len(self.lstm_layers_sizes)
        # 2. dnn
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input
        self.y = T.ivector('y')

        #######################
        #  build lstm layers  #
        #######################
        print '1. start to build AttendLSTMLayer : ' + str(self.lstm_layers_number) + ', n_attendout: ' + str(cfg.batch_size)
        for i in xrange(1):
            if i == 0:
                input_size = self.n_ins
                input = self.x
            else:
                input_size = self.lstm_layers_sizes[i - 1]
                input = self.bilayers[-1].output
            # Forward
            f_lstm_layer = AttendLSTMLayer(rng=numpy_rng, input=input,
                                           n_in=input_size,
                                           n_out=self.lstm_layers_sizes[i],
                                           steps=cfg.batch_size, draw=draw)
            print '\tbuild f_lstm layer: ' + str(input_size) + ' x ' + str(f_lstm_layer.n_out)
            self.layers.append(f_lstm_layer)
            self.lstm_layers.append(f_lstm_layer)
            self.params.extend(f_lstm_layer.params)
            self.delta_params.extend(f_lstm_layer.delta_params)
            # Backward
            b_lstm_layer = AttendLSTMLayer(rng=numpy_rng, input=input,
                                           n_in=input_size,
                                           n_out=self.lstm_layers_sizes[i],
                                           backwards=True,
                                           steps=cfg.batch_size, draw=draw)
            print '\tbuild b_lstm layer: ' + str(input_size) + ' x ' + str(b_lstm_layer.n_out)
            self.layers.append(b_lstm_layer)
            self.lstm_layers.append(b_lstm_layer)
            self.params.extend(b_lstm_layer.params)
            self.delta_params.extend(b_lstm_layer.delta_params)
            # Sum forward + backward (the backward output is re-reversed first)
            bi_layer = SUMLayer(finput=f_lstm_layer.output,
                                binput=b_lstm_layer.output[::-1],
                                n_out=self.lstm_layers_sizes[i - 1])
            self.bilayers.append(bi_layer)
            print '\tbuild sum layer: ' + str(input_size) + ' x ' + str(bi_layer.n_out)
        print '1. finish AttendLSTMLayer: ' + str(self.bilayers[-1].n_out)

        #######################
        #  build log layers   #
        #######################
        print '3. start to build log layer: 1'
        input_size = self.bilayers[-1].n_out
        input = self.bilayers[-1].output
        logLayer = LogisticRegression(input=input, n_in=input_size, n_out=self.n_outs)
        print '\tbuild final layer: ' + str(input_size) + ' x ' + str(self.n_outs)
        self.layers.append(logLayer)
        self.params.extend(logLayer.params)
        self.delta_params.extend(logLayer.delta_params)
        print '3. finish log layer: ' + str(self.bilayers[-1].n_out)
        print 'Total layers: ' + str(len(self.layers))
        sys.stdout.flush()

        self.finetune_cost = logLayer.negative_log_likelihood(self.y)
        self.errors = logLayer.errors(self.y)
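# ----------------------------------------------------------------------
# Illustration (not part of the original module): the bidirectional merge
# above reverses the backward LSTM's output ([::-1]) so both streams are in
# forward time order before SUMLayer adds them. A hypothetical NumPy sketch:
# ----------------------------------------------------------------------
def _demo_bidirectional_sum():
    import numpy
    fwd = numpy.arange(6.0).reshape(3, 2)  # forward outputs, time-major
    bwd = fwd[::-1]                        # backward pass emits time reversed
    merged = fwd + bwd[::-1]               # re-reverse, then sum per time step
    assert (merged == 2 * fwd).all()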
    def __init__(self, numpy_rng, theano_rng=None, batch_size=256, n_outs=500,
                 conv_layer_configs=[], hidden_layers_sizes=[500, 500],
                 ivec_layers_sizes=[500, 500],
                 conv_activation=T.nnet.sigmoid, full_activation=T.nnet.sigmoid,
                 use_fast=False, update_part=[0, 1], ivec_dim=100):
        self.conv_layers = []
        self.full_layers = []
        self.ivec_layers = []
        self.params = []
        self.delta_params = []

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        input_shape = conv_layer_configs[0]['input_shape']
        n_ins = input_shape[-1] * input_shape[-2] * input_shape[-3]
        # each input row is laid out as [raw frame | i-vector]
        self.iv = self.x[:, n_ins:n_ins + ivec_dim]
        self.raw = self.x[:, 0:n_ins]

        self.conv_layer_num = len(conv_layer_configs)
        self.full_layer_num = len(hidden_layers_sizes)
        self.ivec_layer_num = len(ivec_layers_sizes)

        # construct the adaptation NN
        for i in xrange(self.ivec_layer_num):
            if i == 0:
                input_size = ivec_dim
                layer_input = self.iv
            else:
                input_size = ivec_layers_sizes[i - 1]
                layer_input = self.ivec_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=ivec_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.ivec_layers.append(sigmoid_layer)
            if 0 in update_part:
                self.params.extend(sigmoid_layer.params)
                self.delta_params.extend(sigmoid_layer.delta_params)

        # linear output layer mapping the i-vector back to the input space
        linear_func = lambda x: x
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=self.ivec_layers[-1].output,
                                    n_in=ivec_layers_sizes[-1], n_out=n_ins,
                                    activation=linear_func)
        self.ivec_layers.append(sigmoid_layer)
        if 0 in update_part:
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        for i in xrange(self.conv_layer_num):
            if i == 0:
                # the adaptation shift is added to the raw features
                input = self.raw + self.ivec_layers[-1].output
            else:
                input = self.conv_layers[-1].output
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation,
                                   flatten=config['flatten'], use_fast=use_fast)
            self.conv_layers.append(conv_layer)
            if 1 in update_part:
                self.params.extend(conv_layer.params)
                self.delta_params.extend(conv_layer.delta_params)

        self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]

        for i in xrange(self.full_layer_num):
            # construct the sigmoidal layer
            if i == 0:
                input_size = self.conv_output_dim
                layer_input = self.conv_layers[-1].output
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.full_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=full_activation)
            # add the layer to our list of layers
            self.full_layers.append(sigmoid_layer)
            if 1 in update_part:
                self.params.extend(sigmoid_layer.params)
                self.delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.full_layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.full_layers.append(self.logLayer)
        if 1 in update_part:
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
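# ----------------------------------------------------------------------
# Illustration (not part of the original module): the i-vector network above
# assumes each input row is laid out as [raw frame | i-vector]; the adaptation
# net maps the i-vector to a shift that is added to the raw features. A toy
# NumPy sketch of that layout:
# ----------------------------------------------------------------------
def _demo_ivector_layout():
    import numpy
    n_ins, ivec_dim = 6, 2
    x = numpy.arange(8.0).reshape(1, n_ins + ivec_dim)
    raw = x[:, 0:n_ins]                    # mirrors self.raw
    iv = x[:, n_ins:n_ins + ivec_dim]      # mirrors self.iv
    adapt_shift = numpy.zeros_like(raw)    # stand-in for the adaptation net output
    assert (raw + adapt_shift).shape == raw.shape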
class CNN(CNNBase):
    """ Instantiation of a convolutional neural network. """

    def __init__(self, numpy_rng, theano_rng, batch_size, n_outs,
                 conv_layer_configs, hidden_layer_configs, use_fast=False,
                 conv_activation=T.nnet.sigmoid, hidden_activation=T.nnet.sigmoid,
                 l1_reg=None, l2_reg=None, max_col_norm=None):
        super(CNN, self).__init__(conv_layer_configs, hidden_layer_configs,
                                  l1_reg, l2_reg, max_col_norm)
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        for i in xrange(self.conv_layer_num):
            # construct the convolution layer
            if i == 0:  # input layer
                input = self.x
                is_input_layer = True
            else:
                input = self.layers[-1].output  # output of previous layer
                is_input_layer = False
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation, use_fast=use_fast)
            self.layers.append(conv_layer)
            self.conv_layers.append(conv_layer)
            if config['update'] == True:
                # only some of the convolution layers are updated
                self.params.extend(conv_layer.params)
                self.delta_params.extend(conv_layer.delta_params)

        hidden_layers = hidden_layer_configs['hidden_layers']
        self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
        adv_activation_configs = hidden_layer_configs['adv_activation']

        # flatten the output of the last convolution layer
        self.features = self.conv_layers[-1].output.flatten(2)
        self.features_dim = self.conv_output_dim

        for i in xrange(self.hidden_layer_num):
            # construct the hidden layer
            if i == 0:  # first sigmoid layer after the conv stack
                input_size = self.conv_output_dim
                layer_input = self.features
            else:
                input_size = hidden_layers[i - 1]  # hidden units in the previous layer
                layer_input = self.layers[-1].output
            if adv_activation_configs is None:
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size, n_out=hidden_layers[i],
                                            activation=hidden_activation)
            else:
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers[i] * adv_activation_configs['pool_size'],
                                            activation=hidden_activation,
                                            adv_activation_method=adv_activation_configs['method'],
                                            pool_size=adv_activation_configs['pool_size'],
                                            pnorm_order=adv_activation_configs['pnorm_order'])
            self.layers.append(sigmoid_layer)
            self.mlp_layers.append(sigmoid_layer)
            if config['update'] == True:
                # note: this reuses the 'update' flag of the last conv config
                self.params.extend(sigmoid_layer.params)
                self.delta_params.extend(sigmoid_layer.delta_params)

        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=hidden_layers[-1], n_out=n_outs)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        self.output = self.logLayer.prediction()

        # regularization
        if self.l1_reg is not None:
            self.__l1Regularization__(self.hidden_layer_num * 2)
        if self.l2_reg is not None:
            self.__l2Regularization__(self.hidden_layer_num * 2)

    def save_mlp2dict(self, withfinal=True, max_layer_num=-1):
        if max_layer_num == -1:
            max_layer_num = self.hidden_layer_num
        mlp_dict = {}
        for i in range(max_layer_num):
            dict_a = str(i) + ' W'
            mlp_dict[dict_a] = _array2string(self.mlp_layers[i].params[0].get_value())
            dict_a = str(i) + ' b'
            mlp_dict[dict_a] = _array2string(self.mlp_layers[i].params[1].get_value())
        if withfinal:
            dict_a = 'logreg W'
            mlp_dict[dict_a] = _array2string(self.logLayer.params[0].get_value())
            dict_a = 'logreg b'
            mlp_dict[dict_a] = _array2string(self.logLayer.params[1].get_value())
        return mlp_dict
    def __init__(self, numpy_rng, theano_rng, batch_size, n_outs,
                 conv_layer_configs, hidden_layer_configs, use_fast=False,
                 conv_activation=T.nnet.sigmoid, hidden_activation=T.nnet.sigmoid,
                 l1_reg=None, l2_reg=None, max_col_norm=None):
        super(DropoutCNN, self).__init__(conv_layer_configs, hidden_layer_configs,
                                         l1_reg, l2_reg, max_col_norm)
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        for i in xrange(self.conv_layer_num):
            # construct the convolution layer
            if i == 0:  # input layer
                input = self.x
                is_input_layer = True
            else:
                input = self.layers[-1].output  # output of previous layer
                is_input_layer = False
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation, use_fast=use_fast)
            self.layers.append(conv_layer)
            self.conv_layers.append(conv_layer)
            if config['update'] == True:
                # only some of the convolution layers are updated
                self.params.extend(conv_layer.params)
                self.delta_params.extend(conv_layer.delta_params)

        hidden_layers = hidden_layer_configs['hidden_layers']
        self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
        adv_activation_configs = hidden_layer_configs['adv_activation']

        # flatten the output of the last convolution layer
        self.features = self.conv_layers[-1].output.flatten(2)
        self.features_dim = self.conv_output_dim

        self.dropout_layers = []
        self.dropout_factor = hidden_layer_configs['dropout_factor']
        self.input_dropout_factor = hidden_layer_configs['input_dropout_factor']

        for i in xrange(self.hidden_layer_num):
            # construct the hidden layer
            if i == 0:  # first sigmoid layer after the conv stack
                input_size = self.conv_output_dim
                # apply input dropout to the flattened conv features
                # (the original tested dropout_factor[i] and dropped the
                # un-flattened conv output, which looks unintended)
                if self.input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.features, self.input_dropout_factor)
                else:
                    dropout_layer_input = self.features
                layer_input = self.features
            else:
                input_size = hidden_layers[i - 1]  # hidden units in the previous layer
                dropout_layer_input = self.dropout_layers[-1].dropout_output
                layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
            # the dropout stream trains on dropout_layer_input; the clean stream
            # shares its weights (the original passed layer_input to both,
            # leaving dropout_layer_input unused)
            if adv_activation_configs is None:
                dropout_sigmoid_layer = DropoutHiddenLayer(rng=numpy_rng,
                                                           input=dropout_layer_input,
                                                           n_in=input_size, n_out=hidden_layers[i],
                                                           activation=hidden_activation,
                                                           dropout_factor=self.dropout_factor[i])
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size, n_out=hidden_layers[i],
                                            activation=hidden_activation,
                                            W=dropout_sigmoid_layer.W, b=dropout_sigmoid_layer.b)
            else:
                dropout_sigmoid_layer = DropoutHiddenLayer(rng=numpy_rng,
                                                           input=dropout_layer_input,
                                                           n_in=input_size,
                                                           n_out=hidden_layers[i] * adv_activation_configs['pool_size'],
                                                           activation=hidden_activation,
                                                           adv_activation_method=adv_activation_configs['method'],
                                                           pool_size=adv_activation_configs['pool_size'],
                                                           pnorm_order=adv_activation_configs['pnorm_order'],
                                                           dropout_factor=self.dropout_factor[i])
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers[i] * adv_activation_configs['pool_size'],
                                            activation=hidden_activation,
                                            adv_activation_method=adv_activation_configs['method'],
                                            pool_size=adv_activation_configs['pool_size'],
                                            pnorm_order=adv_activation_configs['pnorm_order'],
                                            W=dropout_sigmoid_layer.W, b=dropout_sigmoid_layer.b)
            self.layers.append(sigmoid_layer)
            self.dropout_layers.append(dropout_sigmoid_layer)
            self.mlp_layers.append(sigmoid_layer)
            if config['update'] == True:
                # note: this reuses the 'update' flag of the last conv config
                self.params.extend(dropout_sigmoid_layer.params)
                self.delta_params.extend(dropout_sigmoid_layer.delta_params)

        self.dropout_logLayer = LogisticRegression(input=self.dropout_layers[-1].dropout_output,
                                                   n_in=hidden_layers[-1], n_out=n_outs)
        self.logLayer = LogisticRegression(
            input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
            n_in=hidden_layers[-1], n_out=n_outs,
            W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)
        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        self.output = self.logLayer.prediction()

        # regularization
        if self.l1_reg is not None:
            self.__l1Regularization__(self.hidden_layer_num * 2)
        if self.l2_reg is not None:
            self.__l2Regularization__(self.hidden_layer_num * 2)
# Assumed imports for the listings below (module paths follow PDNN's
# layout; adjust to your own tree). Later classes additionally reference
# RnnLayer, LSTMLayer, SUMLayer, ConvLayer, RBM and GBRBM from the
# corresponding layer modules.
import collections
import sys

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from layers.logistic_sgd import LogisticRegression
from layers.mlp import HiddenLayer, DropoutHiddenLayer, _dropout_from_layer


class DNN_Dropout(object):

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 activation=T.nnet.sigmoid,
                 input_dropout_factor=0,
                 dropout_factor=[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
                 do_maxout=False, pool_size=1,
                 max_col_norm=None, l1_reg=None, l2_reg=None):

        self.sigmoid_layers = []
        self.dropout_layers = []
        self.params = []
        self.delta_params = []
        self.n_layers = len(hidden_layers_sizes)
        self.max_col_norm = max_col_norm
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.input_dropout_factor = input_dropout_factor
        self.dropout_factor = dropout_factor

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer
            if i == 0:
                input_size = n_ins
                layer_input = self.x
                if input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.x,
                                                              input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                # the clean (evaluation) path scales activations by the keep
                # probability, the usual test-time dropout rescaling
                layer_input = (1 - self.dropout_factor[i - 1]) * self.sigmoid_layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output

            if do_maxout == False:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                                   input=dropout_layer_input,
                                                   n_in=input_size,
                                                   n_out=hidden_layers_sizes[i],
                                                   activation=activation,
                                                   dropout_factor=self.dropout_factor[i])
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i],
                                            activation=activation,
                                            W=dropout_layer.W, b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                                   input=dropout_layer_input,
                                                   n_in=input_size,
                                                   n_out=hidden_layers_sizes[i] * pool_size,
                                                   activation=(lambda x: 1.0 * x),
                                                   dropout_factor=self.dropout_factor[i],
                                                   do_maxout=True, pool_size=pool_size)
                sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                            n_in=input_size,
                                            n_out=hidden_layers_sizes[i] * pool_size,
                                            activation=(lambda x: 1.0 * x),
                                            W=dropout_layer.W, b=dropout_layer.b,
                                            do_maxout=True, pool_size=pool_size)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(
            input=self.dropout_layers[-1].dropout_output,
            n_in=hidden_layers_sizes[-1], n_out=n_outs)
        self.logLayer = LogisticRegression(
            input=(1 - self.dropout_factor[-1]) * self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1], n_out=n_outs,
            W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)

        self.dropout_layers.append(self.dropout_logLayer)
        self.sigmoid_layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.n_layers):
                W = self.params[i * 2]
                self.finetune_cost += self.l1_reg * (abs(W).sum())
        if self.l2_reg is not None:
            for i in xrange(self.n_layers):
                W = self.params[i * 2]
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates; use an OrderedDict so the
        # update order is deterministic (the source used a plain dict here,
        # which newer versions of Theano reject)
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.n_layers):
                W = self.params[i * 2]
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(inputs=[index,
                                           theano.Param(learning_rate, default=0.0001),
                                           theano.Param(momentum, default=0.5)],
                                   outputs=self.errors,
                                   updates=updates,
                                   givens={
                                       self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                       self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(inputs=[index],
                                   outputs=self.errors,
                                   givens={
                                       self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                                       self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
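# --- Usage sketch (not in the original source) ---------------------------
# A minimal fine-tuning loop for DNN_Dropout, under these assumptions:
# `train_xy` and `valid_xy` are (shared_x, shared_y) tuples already built
# elsewhere, and the function name below is ours, not PDNN's. Keyword calls
# work because the symbolic inputs above are named 'index',
# 'learning_rate' and 'momentum'.
def _demo_finetune_dropout(train_xy, valid_xy, batch_size=256, n_epochs=10):
    numpy_rng = numpy.random.RandomState(1234)
    net = DNN_Dropout(numpy_rng=numpy_rng, n_ins=784,
                      hidden_layers_sizes=[1024, 1024], n_outs=10,
                      input_dropout_factor=0.1, dropout_factor=[0.2, 0.2])
    train_fn, valid_fn = net.build_finetune_functions(train_xy, valid_xy, batch_size)
    n_train_batches = train_xy[0].get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_xy[0].get_value(borrow=True).shape[0] / batch_size
    for epoch in xrange(n_epochs):
        for b in xrange(n_train_batches):
            train_fn(index=b, learning_rate=0.08, momentum=0.5)
        err = numpy.mean([valid_fn(index=b) for b in xrange(n_valid_batches)])
        print 'epoch %d, validation error %f' % (epoch, err)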
# Variant constructor for a plain (non-dropout) DNN with maxout / p-norm
# support; the enclosing class statement is not present in the source.
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             activation=T.nnet.sigmoid,
             do_maxout=False, pool_size=1,
             do_pnorm=False, pnorm_order=1,
             max_col_norm=None, l1_reg=None, l2_reg=None):

    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []
    self.n_layers = len(hidden_layers_sizes)
    self.max_col_norm = max_col_norm
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        if do_maxout == True:
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i] * pool_size,
                                        activation=(lambda x: 1.0 * x),
                                        do_maxout=True, pool_size=pool_size)
        elif do_pnorm == True:
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i] * pool_size,
                                        activation=(lambda x: 1.0 * x),
                                        do_pnorm=True, pool_size=pool_size,
                                        pnorm_order=pnorm_order)
        else:
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=activation)

        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.sigmoid_layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.sigmoid_layers.append(self.logLayer)
    self.params.extend(self.logLayer.params)
    self.delta_params.extend(self.logLayer.delta_params)

    # compute the cost for the second phase of training,
    # defined as the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)

    if self.l1_reg is not None:
        for i in xrange(self.n_layers):
            W = self.params[i * 2]
            self.finetune_cost += self.l1_reg * (abs(W).sum())
    if self.l2_reg is not None:
        for i in xrange(self.n_layers):
            W = self.params[i * 2]
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
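# --- Illustration (not in the original source) ---------------------------
# What the maxout branch above computes: the hidden layer produces
# hidden_layers_sizes[i] * pool_size linear outputs and keeps the max of
# each group of pool_size units (the p-norm branch instead takes
# (sum_j |x_j|^p)^(1/p) over each group). A plain-numpy sketch, assuming
# contiguous grouping; the real HiddenLayer may group units differently.
def _maxout_pool_demo(linear_out, pool_size):
    # linear_out: (batch, n_units * pool_size) -> (batch, n_units)
    batch, width = linear_out.shape
    return linear_out.reshape(batch, width // pool_size, pool_size).max(axis=2)

# _maxout_pool_demo(numpy.arange(12.).reshape(2, 6), 3)
#   -> [[ 2.,  5.], [ 8., 11.]]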
# Variant constructor for a deep recurrent network (DRN): identical to the
# cfg-driven DNN constructor except that one hidden layer (index 2) is
# built as a recurrent RnnLayer. The enclosing class statement is not
# present in the source.
def __init__(self, numpy_rng, theano_rng=None,
             cfg=None,  # the network configuration
             dnn_shared=None, shared_layers=[], input=None):

    self.layers = []
    self.params = []
    self.delta_params = []
    self.rnn_layerX = 2  # index of the hidden layer that is recurrent
    print "Use DRN"

    self.cfg = cfg
    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    self.max_col_norm = cfg.max_col_norm
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    if input == None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.ivector('y')

    for i in xrange(self.hidden_layers_number):
        # construct the hidden layer
        if i == 0:
            input_size = self.n_ins
            layer_input = self.x
        else:
            input_size = self.hidden_layers_sizes[i - 1]
            layer_input = self.layers[-1].output

        W = None
        b = None
        if i in shared_layers:
            W = dnn_shared.layers[i].W
            b = dnn_shared.layers[i].b

        if i == self.rnn_layerX:
            hidden_layer = RnnLayer(rng=numpy_rng, input=layer_input,
                                    n_in=input_size,
                                    n_out=self.hidden_layers_sizes[i],
                                    W=W, b=b, activation=self.activation)
        else:
            if self.do_maxout == True:
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                           W=W, b=b,
                                           activation=(lambda x: 1.0 * x),
                                           do_maxout=True, pool_size=self.pool_size)
            else:
                hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i],
                                           W=W, b=b, activation=self.activation)

        # add the layer to our list of layers
        self.layers.append(hidden_layer)
        self.params.extend(hidden_layer.params)
        self.delta_params.extend(hidden_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                       n_in=self.hidden_layers_sizes[-1],
                                       n_out=self.n_outs)

    if self.n_outs > 0:
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

    # compute the cost for the second phase of training,
    # defined as the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)

    if self.l1_reg is not None:
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l1_reg * (abs(W).sum())
    if self.l2_reg is not None:
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
class DBN(nnet):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of
    each other. The hidden layer of the RBM at layer `i` becomes the input
    of the RBM at layer `i+1`. The first-layer RBM gets the input of the
    network as its input, and the hidden layer of the last RBM represents
    the output. When used for classification, the DBN is treated as an MLP
    by adding a logistic regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 first_layer_gb=True, pretrainedLayers=None,
                 activation=T.nnet.sigmoid):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw
                          initial weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given, one
                           is generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes; must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type first_layer_gb: bool
        :param first_layer_gb: whether the first layer is
                               Gaussian-Bernoulli or Bernoulli-Bernoulli
        """
        # params and delta_params are initialized by the nnet base class
        super(DBN, self).__init__()

        self.layers = []
        self.rbm_layers = []
        self.n_layers = len(hidden_layers_sizes)
        # the source assigned the bare name `n_layers` here, which is
        # undefined in this scope; self.n_layers is clearly what was meant
        if pretrainedLayers == None:
            self.nPreTrainLayers = self.n_layers
        else:
            self.nPreTrainLayers = pretrainedLayers

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as a 1D vector
                                 # of [int] labels

        # The DBN is an MLP for which all weights of intermediate layers
        # are shared with a different RBM. We first construct the DBN as a
        # deep multilayer perceptron, and while constructing each sigmoidal
        # layer we also construct an RBM that shares weights with that
        # layer. During pretraining we train these RBMs (which also changes
        # the weights of the MLP); during finetuning we finish training the
        # DBN by doing stochastic gradient descent on the MLP.
        for i in xrange(self.n_layers):
            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first
            # layer; the input to this layer is either the activation of
            # the hidden layer below or the input of the DBN
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=activation)

            # add the layer to our list of layers; the parameters of the
            # sigmoid layers are parameters of the DBN, while the visible
            # biases in the RBMs are parameters of those RBMs only
            self.layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

            # construct an RBM that shares weights with this layer; the
            # first layer can be a Gaussian-Bernoulli RBM, the other layers
            # are Bernoulli-Bernoulli RBMs
            if i == 0 and first_layer_gb:
                rbm_layer = GBRBM(numpy_rng=numpy_rng, theano_rng=theano_rng,
                                  input=layer_input,
                                  n_visible=input_size,
                                  n_hidden=hidden_layers_sizes[i],
                                  W=sigmoid_layer.W, hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W, hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        # the cost for the second phase of training is the negative log
        # likelihood of the logistic regression (output) layer; self.errors
        # counts the errors made on the minibatch given by self.x, self.y
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        self.output = self.logLayer.prediction()
        self.features = self.layers[-2].output
        self.features_dim = self.layers[-2].n_out

    def pretraining_functions(self, train_set_x, batch_size, weight_cost):
        '''Generates a list of functions, each performing one step of
        gradient descent for a given layer. To train an RBM you just
        iterate, calling the corresponding function on all minibatch
        indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: shared variable that contains all datapoints
                            used for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param weight_cost: weight cost
        '''
        index = T.lscalar('index')      # index to a [mini]batch
        momentum = T.scalar('momentum')
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning and end of a batch, given `index`
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list, using CD-k
            # (persistent=None, k=1) to train each RBM
            r_cost, fe_cost, updates = rbm.get_cost_updates(batch_size,
                                                            learning_rate,
                                                            momentum,
                                                            weight_cost)

            # compile the theano function
            fn = theano.function(inputs=[index,
                                         theano.Param(learning_rate, default=0.0001),
                                         theano.Param(momentum, default=0.5)],
                                 outputs=[r_cost, fe_cost],
                                 updates=updates,
                                 givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
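# --- Usage sketch (not in the original source) ---------------------------
# Greedy layer-wise pretraining driven by the functions returned above.
# `train_set_x` is assumed to be a Theano shared variable, and the function
# name is ours, not PDNN's. Note the keyword names: the learning rate is
# passed as `lr` because the symbolic scalar above is named 'lr'.
def _demo_pretrain_dbn(train_set_x, batch_size=128, epochs_per_layer=5):
    numpy_rng = numpy.random.RandomState(4321)
    dbn = DBN(numpy_rng=numpy_rng, n_ins=784,
              hidden_layers_sizes=[1024, 1024], n_outs=10)
    pretrain_fns = dbn.pretraining_functions(train_set_x, batch_size,
                                             weight_cost=0.0002)
    n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    for layer, fn in enumerate(pretrain_fns[:dbn.nPreTrainLayers]):
        for epoch in xrange(epochs_per_layer):
            r_costs = [fn(index=b, lr=0.01, momentum=0.5)[0]
                       for b in xrange(n_batches)]
            print 'layer %d, epoch %d, reconstruction cost %f' % (
                layer, epoch, numpy.mean(r_costs))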
# Variant constructor for the cfg-driven dropout network, with optional
# weight sharing against an existing net (dnn_shared / shared_layers). The
# enclosing class statement is not present in the source.
def __init__(self, numpy_rng, theano_rng=None, cfg=None,
             dnn_shared=None, shared_layers=[]):

    self.layers = []
    self.dropout_layers = []
    self.params = []
    self.delta_params = []

    self.cfg = cfg
    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    self.input_dropout_factor = cfg.input_dropout_factor
    self.dropout_factor = cfg.dropout_factor
    self.max_col_norm = cfg.max_col_norm
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in range(self.hidden_layers_number):
        # construct the hidden layer
        if i == 0:
            input_size = self.n_ins
            layer_input = self.x
            if self.input_dropout_factor > 0.0:
                dropout_layer_input = _dropout_from_layer(theano_rng, self.x,
                                                          self.input_dropout_factor)
            else:
                dropout_layer_input = self.x
        else:
            input_size = self.hidden_layers_sizes[i - 1]
            layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
            dropout_layer_input = self.dropout_layers[-1].dropout_output

        W = None
        b = None
        if i in shared_layers:
            W = dnn_shared.layers[i].W
            b = dnn_shared.layers[i].b

        if self.do_maxout == False:
            dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                               input=dropout_layer_input,
                                               n_in=input_size,
                                               n_out=self.hidden_layers_sizes[i],
                                               W=W, b=b,
                                               activation=self.activation,
                                               dropout_factor=self.dropout_factor[i])
            hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                       n_in=input_size,
                                       n_out=self.hidden_layers_sizes[i],
                                       activation=self.activation,
                                       W=dropout_layer.W, b=dropout_layer.b)
        else:
            dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                               input=dropout_layer_input,
                                               n_in=input_size,
                                               n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                               W=W, b=b,
                                               activation=(lambda x: 1.0 * x),
                                               dropout_factor=self.dropout_factor[i],
                                               do_maxout=True, pool_size=self.pool_size)
            hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                       n_in=input_size,
                                       n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                       activation=(lambda x: 1.0 * x),
                                       W=dropout_layer.W, b=dropout_layer.b,
                                       do_maxout=True, pool_size=self.pool_size)

        # add the layer to our list of layers
        self.layers.append(hidden_layer)
        self.dropout_layers.append(dropout_layer)
        self.params.extend(dropout_layer.params)
        self.delta_params.extend(dropout_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.dropout_logLayer = LogisticRegression(
        input=self.dropout_layers[-1].dropout_output,
        n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)
    self.logLayer = LogisticRegression(
        input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
        n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs,
        W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)

    self.dropout_layers.append(self.dropout_logLayer)
    self.layers.append(self.logLayer)
    self.params.extend(self.dropout_logLayer.params)
    self.delta_params.extend(self.dropout_logLayer.delta_params)

    # compute the cost
    self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)

    if self.l1_reg is not None:
        for i in range(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l1_reg * (abs(W).sum())
    if self.l2_reg is not None:
        for i in range(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
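# --- Usage sketch (not in the original source) ---------------------------
# How dnn_shared / shared_layers are meant to be used: build one network,
# then let a second one reuse (share, not copy) the Theano shared W and b
# of selected hidden layers. `NetClass` stands for whichever class owns the
# constructor above (its class statement is missing from this listing), and
# `cfg` is a NetworkConfig-style object as elsewhere in the source.
#
#   si_net = NetClass(numpy_rng=numpy_rng, cfg=cfg)
#   # layers 0 and 1 of the new net point at si_net's parameters
#   sa_net = NetClass(numpy_rng=numpy_rng, cfg=cfg,
#                     dnn_shared=si_net, shared_layers=[0, 1])
#
# Because the parameters are shared objects, updates applied while training
# sa_net are immediately visible through si_net as well.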
# Variant constructor for a bidirectional-LSTM front-end followed by a
# softmax output layer; the enclosing class statement is not present in the
# source. Note that cfg.hidden_layers_sizes is read but no fully-connected
# section is built in this listing (the build steps jump from "1." to "3.").
def __init__(self, numpy_rng, theano_rng=None,
             cfg=None,  # the network configuration
             dnn_shared=None, shared_layers=[], input=None):

    self.cfg = cfg
    self.params = []
    self.delta_params = []
    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    self.max_col_norm = cfg.max_col_norm
    print self.max_col_norm

    self.layers = []
    self.bilayers = []
    self.lstm_layers = []
    self.fc_layers = []

    # 1. lstm
    self.lstm_layers_sizes = cfg.lstm_layers_sizes
    self.lstm_layers_number = len(self.lstm_layers_sizes)
    # 2. dnn
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    if input == None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.ivector('y')

    #######################
    #  build lstm layers  #
    #######################
    print '1. start to build lstm layer: ' + str(self.lstm_layers_number)
    for i in xrange(self.lstm_layers_number):
        if i == 0:
            input_size = self.n_ins
            input = self.x
        else:
            input_size = self.lstm_layers_sizes[i - 1]
            input = self.bilayers[-1].output

        # forward pass over the sequence
        f_lstm_layer = LSTMLayer(rng=numpy_rng, input=input,
                                 n_in=input_size,
                                 n_out=self.lstm_layers_sizes[i])
        print '\tbuild f_lstm layer: ' + str(input_size) + ' x ' + str(f_lstm_layer.n_out)
        self.layers.append(f_lstm_layer)
        self.lstm_layers.append(f_lstm_layer)
        self.params.extend(f_lstm_layer.params)
        self.delta_params.extend(f_lstm_layer.delta_params)

        # backward pass over the sequence
        b_lstm_layer = LSTMLayer(rng=numpy_rng, input=input,
                                 n_in=input_size,
                                 n_out=self.lstm_layers_sizes[i],
                                 backwards=True)
        print '\tbuild b_lstm layer: ' + str(input_size) + ' x ' + str(b_lstm_layer.n_out)
        self.layers.append(b_lstm_layer)
        self.lstm_layers.append(b_lstm_layer)
        self.params.extend(b_lstm_layer.params)
        self.delta_params.extend(b_lstm_layer.delta_params)

        # sum forward + backward; the backward output is reversed so both
        # streams line up in time. (The source sized this layer with
        # lstm_layers_sizes[i - 1], which mis-sizes the first layer when
        # layer sizes differ; [i] matches the two LSTM outputs being summed.)
        bi_layer = SUMLayer(finput=f_lstm_layer.output,
                            binput=b_lstm_layer.output[::-1],
                            n_out=self.lstm_layers_sizes[i])
        self.bilayers.append(bi_layer)
        print '\tbuild sum layer: ' + str(input_size) + ' x ' + str(bi_layer.n_out)
    print '1. finish lstm layer: ' + str(self.bilayers[-1].n_out)

    #######################
    #  build log layers   #
    #######################
    print '3. start to build log layer: 1'
    input_size = self.bilayers[-1].n_out
    input = self.bilayers[-1].output
    logLayer = LogisticRegression(input=input, n_in=input_size, n_out=self.n_outs)
    print '\tbuild final layer: ' + str(input_size) + ' x ' + str(self.n_outs)
    self.layers.append(logLayer)
    self.params.extend(logLayer.params)
    self.delta_params.extend(logLayer.delta_params)
    print '3. finish log layer: ' + str(self.bilayers[-1].n_out)
    print 'Total layers: ' + str(len(self.layers))
    sys.stdout.flush()

    self.finetune_cost = logLayer.negative_log_likelihood(self.y)
    self.errors = logLayer.errors(self.y)
# Variant DNN constructor (for a class deriving from nnet; the enclosing
# class statement is not in the source) in which the maxout / p-norm
# options are bundled into an `adv_activation` dict.
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             activation=T.nnet.sigmoid,
             adv_activation=None,
             max_col_norm=None, l1_reg=None, l2_reg=None):

    # params and delta_params are initialized by the nnet base class
    super(DNN, self).__init__()

    self.layers = []
    self.n_layers = len(hidden_layers_sizes)
    self.max_col_norm = max_col_norm
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.layers[-1].output

        if adv_activation is not None:
            # the source multiplied by a bare `pool_size` here, which is
            # undefined in this scope; the pool size clearly comes from the
            # adv_activation dict
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i] * adv_activation['pool_size'],
                                        activation=activation,
                                        adv_activation_method=adv_activation['method'],
                                        pool_size=adv_activation['pool_size'],
                                        pnorm_order=adv_activation['pnorm_order'])
        else:
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=activation)

        # add the layer to our list of layers
        self.layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.layers.append(self.logLayer)
    self.params.extend(self.logLayer.params)
    self.delta_params.extend(self.logLayer.delta_params)

    # the cost for the second phase of training is the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)

    if self.l1_reg is not None:
        self.__l1Regularization__()
    if self.l2_reg is not None:
        self.__l2Regularization__()

    self.output = self.logLayer.prediction()
    self.features = self.layers[-2].output
    self.features_dim = self.layers[-2].n_out
def __init__(self, numpy_rng, theano_rng=None,
             upper_hidden_layers_sizes=[500, 500], n_outs=10,
             tower1_hidden_layers_sizes=[500, 500], tower1_n_ins=100,
             tower2_hidden_layers_sizes=[500, 500], tower2_n_ins=100,
             activation=T.nnet.sigmoid,
             do_maxout=False, pool_size=1,
             do_pnorm=False, pnorm_order=1,
             max_col_norm=None, l1_reg=None, l2_reg=None):

    self.tower1_layers = []
    self.tower2_layers = []
    self.upper_layers = []
    self.params = []
    self.delta_params = []
    self.max_col_norm = max_col_norm
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.ivector('y')
    self.tower1_input = self.x[:, 0:tower1_n_ins]
    self.tower2_input = self.x[:, tower1_n_ins:(tower1_n_ins + tower2_n_ins)]

    # build tower1
    for i in xrange(len(tower1_hidden_layers_sizes)):
        if i == 0:
            input_size = tower1_n_ins
            layer_input = self.tower1_input
        else:
            input_size = tower1_hidden_layers_sizes[i - 1]
            layer_input = self.tower1_layers[-1].output

        layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                            n_in=input_size,
                            n_out=tower1_hidden_layers_sizes[i],
                            activation=T.nnet.sigmoid)
        # add the layer to our list of layers
        self.tower1_layers.append(layer)
        self.params.extend(layer.params)
        self.delta_params.extend(layer.delta_params)

    # build tower2
    for i in xrange(len(tower2_hidden_layers_sizes)):
        if i == 0:
            input_size = tower2_n_ins
            layer_input = self.tower2_input
        else:
            input_size = tower2_hidden_layers_sizes[i - 1]
            layer_input = self.tower2_layers[-1].output

        layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                            n_in=input_size,
                            n_out=tower2_hidden_layers_sizes[i],
                            activation=T.nnet.sigmoid)
        # add the layer to our list of layers
        self.tower2_layers.append(layer)
        self.params.extend(layer.params)
        self.delta_params.extend(layer.delta_params)

    for i in xrange(len(upper_hidden_layers_sizes)):
        # construct the sigmoidal layer
        if i == 0:
            input_size = tower1_hidden_layers_sizes[-1] + tower2_hidden_layers_sizes[-1]
            layer_input = T.concatenate([self.tower1_layers[-1].output,
                                         self.tower2_layers[-1].output], axis=1)
        else:
            input_size = upper_hidden_layers_sizes[i - 1]
            layer_input = self.upper_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                    n_in=input_size,
                                    n_out=upper_hidden_layers_sizes[i],
                                    activation=activation)
        # add the layer to our list of layers
        self.upper_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(input=self.upper_layers[-1].output,
                                       n_in=upper_hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.upper_layers.append(self.logLayer)
    self.params.extend(self.logLayer.params)
    self.delta_params.extend(self.logLayer.delta_params)

    # compute the cost for the second phase of training,
    # defined as the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)
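# --- Usage sketch (not in the original source) ---------------------------
# The constructor above splits each input row by columns: the first
# tower1_n_ins columns feed tower 1, the next tower2_n_ins columns feed
# tower 2, and the towers' top activations are concatenated for the upper
# layers. Assuming the constructor belongs to a class named DNN_2Tower
# (its class statement is missing from this listing):
#
#   net = DNN_2Tower(numpy_rng=numpy.random.RandomState(2718),
#                    tower1_n_ins=360, tower1_hidden_layers_sizes=[512],
#                    tower2_n_ins=40,  tower2_hidden_layers_sizes=[128],
#                    upper_hidden_layers_sizes=[1024], n_outs=1000)
#
# Each row of net.x is then expected to be 360 + 40 = 400 columns wide.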
class CNN(object):

    def __init__(self, numpy_rng, theano_rng=None, batch_size=256, n_outs=500,
                 sparsity=None, sparsity_weight=None, sparse_layer=3,
                 conv_layer_configs=[], hidden_layers_sizes=[500, 500],
                 conv_activation=T.nnet.sigmoid, full_activation=T.nnet.sigmoid,
                 use_fast=False):

        self.layers = []
        self.params = []
        self.delta_params = []

        self.sparsity = sparsity
        self.sparsity_weight = sparsity_weight
        self.sparse_layer = sparse_layer

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        self.conv_layer_num = len(conv_layer_configs)
        self.full_layer_num = len(hidden_layers_sizes)

        # build the convolutional layers
        for i in xrange(self.conv_layer_num):
            if i == 0:
                input = self.x
                is_input_layer = True
            else:
                input = self.layers[-1].output
                is_input_layer = False
            config = conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   is_input_layer=is_input_layer,
                                   input_shape=config['input_shape'],
                                   filter_shape=config['filter_shape'],
                                   poolsize=config['poolsize'],
                                   activation=conv_activation,
                                   flatten=config['flatten'])
            self.layers.append(conv_layer)
            self.params.extend(conv_layer.params)
            self.delta_params.extend(conv_layer.delta_params)

        self.conv_output_dim = (config['output_shape'][1]
                                * config['output_shape'][2]
                                * config['output_shape'][3])

        # build the fully-connected layers
        for i in xrange(self.full_layer_num):
            # construct the sigmoidal layer
            if i == 0:
                input_size = self.conv_output_dim
            else:
                input_size = hidden_layers_sizes[i - 1]
            layer_input = self.layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=full_activation)
            # add the layer to our list of layers
            self.layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=hidden_layers_sizes[-1],
                                           n_out=n_outs)
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

        if self.sparsity_weight is not None:
            # the 630 here is hard-coded in the source (the width of the
            # sparsity-penalized layer); the source also read a
            # self.sigmoid_layers attribute that this class never defines,
            # so self.layers is used instead
            sparsity_level = T.extra_ops.repeat(self.sparsity, 630)
            avg_act = self.layers[sparse_layer].output.mean(axis=0)
            kl_div = self.kl_divergence(sparsity_level, avg_act)
            self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y)
                                  + self.sparsity_weight * kl_div.sum())
        else:
            self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        self.errors = self.logLayer.errors(self.y)

    def kl_divergence(self, p, p_hat):
        return p * T.log(p / p_hat) + (1 - p) * T.log((1 - p) / (1 - p_hat))

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        train_fn = theano.function(inputs=[index,
                                           theano.Param(learning_rate, default=0.0001),
                                           theano.Param(momentum, default=0.5)],
                                   outputs=self.errors,
                                   updates=updates,
                                   givens={
                                       self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                       self.y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_fn = theano.function(inputs=[index],
                                   outputs=self.errors,
                                   givens={
                                       self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                                       self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        return train_fn, valid_fn
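# --- Usage sketch (not in the original source) ---------------------------
# Driving the CNN above. conv_layer_configs is a list of per-layer dicts
# with the keys read in __init__ ('input_shape', 'filter_shape', 'poolsize',
# 'output_shape', 'flatten'); the shapes below are illustrative values for
# 28x28 single-channel inputs, and train_xy / valid_xy are assumed
# (shared_x, shared_y) tuples built elsewhere. The function name is ours,
# not PDNN's.
def _demo_finetune_cnn(train_xy, valid_xy, batch_size=256):
    conv_layer_configs = [
        {'input_shape': (batch_size, 1, 28, 28),
         'filter_shape': (32, 1, 5, 5), 'poolsize': (2, 2),
         'output_shape': (batch_size, 32, 12, 12), 'flatten': False},
        {'input_shape': (batch_size, 32, 12, 12),
         'filter_shape': (64, 32, 5, 5), 'poolsize': (2, 2),
         'output_shape': (batch_size, 64, 4, 4), 'flatten': True},
    ]
    cnn = CNN(numpy_rng=numpy.random.RandomState(345), batch_size=batch_size,
              n_outs=10, conv_layer_configs=conv_layer_configs,
              hidden_layers_sizes=[1024, 1024])
    train_fn, valid_fn = cnn.build_finetune_functions(train_xy, valid_xy, batch_size)
    # one update step on the first minibatch; returns the minibatch error
    return train_fn(index=0, learning_rate=0.05, momentum=0.5)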