def train_conv_net(datasets, U, img_w=300, filter_hs=[3, 4, 5], hidden_units=[100, 2],
                   dropout_rate=[0.5], shuffle_batch=True, n_epochs=25, batch_size=50,
                   lr_decay=0.95, conv_non_linear="relu", activations=[Iden],
                   sqr_norm_lim=9, non_static=True):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes
    hidden_units = [x, y]: x is the number of feature maps (per filter window), y is the number of output classes
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0]) - 1
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear),
                  ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)]
    print parameters

    # define model architecture
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(Words, T.set_subtensor(Words[0, :], zero_vec_tensor))],
                               allow_input_downcast=True)
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], Words.shape[1]))
    conv_layers = []
    layer1_inputs = []
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng, input=layer0_input,
                                        image_shape=(batch_size, 1, img_h, img_w),
                                        filter_shape=filter_shape, poolsize=pool_size,
                                        non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)
    layer1_input = T.concatenate(layer1_inputs, 1)
    hidden_units[0] = feature_maps * len(filter_hs)
    classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units,
                            activations=activations, dropout_rates=dropout_rate)

    # define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        # if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    # shuffle dataset and assign to mini batches; if dataset size is not a multiple
    # of the batch size, replicate extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))

    # divide train set into train/val sets
    test_set_x = datasets[1][:, :img_h]
    test_set_y = np.asarray(datasets[1][:, -1], "int32")
    train_set = new_data[:n_train_batches * batch_size, :]
    val_set = new_data[n_train_batches * batch_size:, :]
    train_set_x, train_set_y = shared_dataset((train_set[:, :img_h], train_set[:, -1]))
    val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1]))
    n_val_batches = n_batches - n_train_batches

    # compile theano functions to get train/val/test errors
    val_model = theano.function([index], classifier.errors(y),
                                givens={
                                    x: val_set_x[index * batch_size:(index + 1) * batch_size],
                                    y: val_set_y[index * batch_size:(index + 1) * batch_size]},
                                allow_input_downcast=True)
    test_model = theano.function([index], classifier.errors(y),
                                 givens={
                                     x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_set_y[index * batch_size:(index + 1) * batch_size]},
                                 allow_input_downcast=True)
    train_model = theano.function([index], cost, updates=grad_updates,
                                  givens={
                                      x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                      y: train_set_y[index * batch_size:(index + 1) * batch_size]},
                                  allow_input_downcast=True)

    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (test_size, 1, img_h, Words.shape[1]))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = theano.function([x, y], test_error, allow_input_downcast=True)

    # start training over mini-batches
    print '... training'
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print('epoch %i, train perf %f %%, val perf %f %%' % (epoch, train_perf * 100., val_perf * 100.))
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_loss
    return test_perf, params
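
# --------------------------------------------------------------------------
# shared_dataset() is called above but not defined in this section.  The
# version below is a minimal sketch (an assumption, modeled on the standard
# Theano deep-learning-tutorial helper), not necessarily the authors' exact
# code: the design matrix and the labels are copied into floatX shared
# variables so minibatches can be sliced on the GPU, and the labels are
# handed back as a symbolic int32 cast (GPU-resident shared storage must be
# float).  train_pos_cnn and the Mairesse variant further down use analogous
# three-field versions of the same idea.
# --------------------------------------------------------------------------
import numpy as np
import theano
import theano.tensor as T

def shared_dataset(data_xy, borrow=True):
    """Load an (x, y) pair into Theano shared variables (sketch)."""
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    # labels are stored as floats but consumed as int32 class indices
    return shared_x, T.cast(shared_y, 'int32')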
# Fragment: a variant of the train/val handling in train_conv_net() above.
# Instead of shared_dataset(), the training labels are first stored in a
# floatX shared variable and then cast to int32 symbolically (Theano shared
# variables kept on the GPU must hold floats).
    val_set = new_data[n_train_batches * batch_size:, :]
    train_set_x = theano.shared(np.asarray(train_set[:, :img_h], dtype=theano.config.floatX),
                                borrow=True)
    train_set_y_before_cast = theano.shared(np.asarray(train_set[:, -1], dtype=theano.config.floatX),
                                            borrow=True)
    train_set_y = T.cast(train_set_y_before_cast, 'int32')
    # train_set_x, train_set_y = shared_dataset((train_set[:, :img_h], train_set[:, -1]))
    val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1]))
    n_val_batches = n_batches - n_train_batches
    val_model = theano.function([index], classifier.errors(y),
                                givens={
                                    x: val_set_x[index * batch_size:(index + 1) * batch_size],
                                    y: val_set_y[index * batch_size:(index + 1) * batch_size]},
                                allow_input_downcast=True)
    # compile theano functions to get train/val/test errors
    test_model = theano.function([index], classifier.errors(y),
                                 givens={
                                     x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_set_y[index * batch_size:(index + 1) * batch_size]},
                                 allow_input_downcast=True)
    train_model = theano.function([index], cost, updates=grad_updates,
                                  givens={
                                      x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                      y: train_set_y[index * batch_size:(index + 1) * batch_size]},
                                  allow_input_downcast=True)
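
# --------------------------------------------------------------------------
# Hedged usage sketch (not from the original repository) for the first
# train_conv_net() above, only to make the expected data layout concrete:
# each dataset row is img_h padded word indices followed by one integer
# class label, datasets = [train, test], and U is a |vocab| x img_w float
# embedding matrix whose row 0 is reserved for padding.  Random indices and
# random vectors stand in for real sentences and word2vec; it assumes that
# this first definition of train_conv_net and its dependencies
# (LeNetConvPoolLayer, MLPDropout, Iden, sgd_updates_adadelta) are importable.
# --------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(0)
    vocab_size, toy_img_w, toy_img_h = 5000, 300, 56
    U = np.random.uniform(-0.25, 0.25, (vocab_size, toy_img_w)).astype(theano.config.floatX)
    U[0] = 0.  # index 0 = padding token

    def make_split(n):
        word_idxs = np.random.randint(1, vocab_size, (n, toy_img_h))
        labels = np.random.randint(0, 2, (n, 1))
        return np.hstack([word_idxs, labels])

    datasets = [make_split(500), make_split(100)]  # [train rows, test rows]
    perf, params = train_conv_net(datasets, U, img_w=toy_img_w,
                                  hidden_units=[100, 2], batch_size=50, n_epochs=2)
    print 'toy test accuracy:', perf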
def train_pos_cnn(datasets, W, P, filter_hs, hidden_units, dropout_rates, n_epochs, batch_size,
                  lr_decay, conv_non_linear, activations, sqr_norm_lim, model):
    # print params
    parameters = [("num_filters", hidden_units[0]), ("num_classes", hidden_units[1]),
                  ("filter_types", filter_hs), ("dropout", dropout_rates),
                  ("num_epochs", n_epochs), ("batch_size", batch_size),
                  ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear),
                  ("sqr_norm_lim", sqr_norm_lim), ("model", model)]
    print parameters

    ##########################
    #   model architecture   #
    ##########################
    print 'building the model architecture...'

    index = T.lscalar()
    x = T.matrix('x')   # words
    y = T.ivector('y')  # labels
    z = T.matrix('z')   # tags
    curr_batch_size = T.lscalar()
    is_train = T.iscalar('is_train')  # 1=train, 0=test

    # set necessary variables
    rng = np.random.RandomState(3435)
    img_h = (len(datasets[0][0]) - 1) / 2  # input height = seq len
    feature_maps = hidden_units[0]         # num filters

    # EMBEDDING LAYER
    embedding_layer = EmbeddingLayer(rng, is_train, x, z, curr_batch_size, img_h,
                                     W, P, model, dropout_rates[0])
    layer0_input = embedding_layer.output
    img_w = embedding_layer.final_token_dim  # img w = filter width = input matrix width

    # set more variables
    filter_w = img_w  # filter width = input matrix width

    # construct filter shapes and pool sizes
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    # CONV-POOL LAYER
    conv_layers = []
    layer1_inputs = []
    for i in xrange(len(filter_shapes)):
        conv_layer = LeNetConvPoolLayer(rng, input=layer0_input,
                                        image_shape=(None, 1, img_h, img_w),
                                        filter_shape=filter_shapes[i],
                                        poolsize=pool_sizes[i],
                                        non_linear=conv_non_linear)
        layer1_inputs.append(conv_layer.output.flatten(2))
        conv_layers.append(conv_layer)
    layer1_input = T.concatenate(layer1_inputs, 1)
    hidden_units[0] = feature_maps * len(filter_shapes)  # update the hidden units

    # OUTPUT LAYER (Dropout, Fully-Connected, Soft-Max)
    classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units,
                            activations=activations, dropout_rate=dropout_rates[1])

    # UPDATE
    params = classifier.params + embedding_layer.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)  # use this to update
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    ##########################
    #    dataset handling    #
    ##########################
    print 'handling dataset...'

    # train
    # if len(datasets[0]) % batch_size != 0:
    #     datasets[0] = np.random.permutation(datasets[0])
    #     to_add = batch_size - len(datasets[0]) % batch_size
    #     datasets[0] = np.concatenate((datasets[0], datasets[0][:to_add]))
    train_set_x, train_set_y, train_set_z = \
        shared_dataset((datasets[0][:, :img_h], datasets[0][:, -1], datasets[0][:, img_h:2*img_h]))
    n_train_batches = int(len(datasets[0]) / batch_size)
    if len(datasets[0]) % batch_size > 0:
        n_train_batches += 1

    # val
    # if len(datasets[1]) % batch_size != 0:
    #     datasets[1] = np.random.permutation(datasets[1])
    #     to_add = batch_size - len(datasets[1]) % batch_size
    #     datasets[1] = np.concatenate((datasets[1], datasets[1][:to_add]))
    val_set_x, val_set_y, val_set_z = \
        shared_dataset((datasets[1][:, :img_h], datasets[1][:, -1], datasets[1][:, img_h:2*img_h]))
    n_val_batches = int(len(datasets[1]) / batch_size)
    if len(datasets[1]) % batch_size > 0:
        n_val_batches += 1

    # test
    test_set_x, test_set_y, test_set_z = \
        shared_dataset((datasets[2][:, :img_h], datasets[2][:, -1], datasets[2][:, img_h:2*img_h]))
    n_test_batches = int(len(datasets[2]) / batch_size)
    if len(datasets[2]) % batch_size > 0:
        n_test_batches += 1

    ##########################
    #    theano functions    #
    ##########################
    print 'preparing theano functions...'

    zero_vec_tensor = T.vector()
    set_zero_word = theano.function([zero_vec_tensor],
                                    updates=[(embedding_layer.Words,
                                              T.set_subtensor(embedding_layer.Words[0, :], zero_vec_tensor))],
                                    allow_input_downcast=True)
    if model != 'notag':
        set_zero_pos = theano.function([zero_vec_tensor],
                                       updates=[(embedding_layer.Tags,
                                                 T.set_subtensor(embedding_layer.Tags[0, :], zero_vec_tensor))],
                                       allow_input_downcast=True)

    val_model = theano.function([index, curr_batch_size], classifier.errors(y),
                                givens={
                                    x: val_set_x[index * batch_size:(index + 1) * batch_size],
                                    y: val_set_y[index * batch_size:(index + 1) * batch_size],
                                    z: val_set_z[index * batch_size:(index + 1) * batch_size],
                                    is_train: np.cast['int32'](0)},
                                allow_input_downcast=True, on_unused_input='ignore')

    train_eval_model = theano.function([index, curr_batch_size], classifier.errors(y),
                                       givens={
                                           x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                           y: train_set_y[index * batch_size:(index + 1) * batch_size],
                                           z: train_set_z[index * batch_size:(index + 1) * batch_size],
                                           is_train: np.cast['int32'](0)},
                                       allow_input_downcast=True, on_unused_input='ignore')

    train_model = theano.function([index, curr_batch_size], cost, updates=grad_updates,
                                  givens={
                                      x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                      y: train_set_y[index * batch_size:(index + 1) * batch_size],
                                      z: train_set_z[index * batch_size:(index + 1) * batch_size],
                                      is_train: np.cast['int32'](1)},
                                  allow_input_downcast=True, on_unused_input='ignore')

    test_model = theano.function([index, curr_batch_size], classifier.errors(y),
                                 givens={
                                     x: test_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: test_set_y[index * batch_size:(index + 1) * batch_size],
                                     z: test_set_z[index * batch_size:(index + 1) * batch_size],
                                     is_train: np.cast['int32'](0)},
                                 allow_input_downcast=True, on_unused_input='ignore')

    ##########################
    #        training        #
    ##########################
    print 'training...'

    epoch = 0
    best_val_perf = 0
    best_test_perf = 0
    best_epoch = 0
    num_epochs_decrease = 0
    prev_val_perf = 0
    while epoch < n_epochs:
        start_time = time.time()
        epoch += 1
        step = 1
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost = train_model(minibatch_index,
                               min(batch_size, len(datasets[0]) - minibatch_index * batch_size))
            set_zero_word(np.zeros(W.shape[1]))
            if model != 'notag':
                set_zero_pos(np.zeros(P.shape[1]))
            step += 1
        train_losses = [train_eval_model(i, min(batch_size, len(datasets[0]) - i * batch_size))
                        for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i, min(batch_size, len(datasets[1]) - i * batch_size))
                      for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        test_losses = [test_model(i, min(batch_size, len(datasets[2]) - i * batch_size))
                       for i in xrange(n_test_batches)]
        test_loss = np.mean(test_losses)
        test_perf = 1 - test_loss
        print 'epoch: {}, time: {} secs, train: {}, val: {}, test: {}'\
            .format(epoch, time.time() - start_time, train_perf * 100., val_perf * 100., test_perf * 100.)
        if val_perf > best_val_perf or (val_perf == best_val_perf and test_perf > best_test_perf):
            best_val_perf = val_perf
            best_test_perf = test_perf
            best_epoch = epoch
        # early stop
        if val_perf < prev_val_perf:
            num_epochs_decrease += 1
        else:
            num_epochs_decrease = 0
        if num_epochs_decrease >= 3:
            break
        prev_val_perf = val_perf
    return best_test_perf, best_val_perf, best_epoch
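
# --------------------------------------------------------------------------
# sgd_updates_adadelta() is called by both training functions above but not
# defined in this section.  The sketch below is an assumption about what it
# computes, modeled on the AdaDelta rule (Zeiler, 2012) as commonly
# implemented in Theano text-CNN code of this kind, with an L2 column-norm
# constraint (sqrt(norm_lim)) on 2-D weight matrices other than the word
# embeddings; the authors' actual helper may differ in details.
# --------------------------------------------------------------------------
from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9, word_vec_name='Words'):
    """AdaDelta updates (sketch): per-parameter running averages of squared
    gradients and squared steps, plus a max-column-norm rescaling."""
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_ups = OrderedDict()
    gparams = []
    for param in params:
        empty = np.zeros_like(param.get_value()).astype(theano.config.floatX)
        exp_sqr_grads[param] = theano.shared(value=empty, name="exp_grad_%s" % param.name)
        exp_sqr_ups[param] = theano.shared(value=empty, name="exp_up_%s" % param.name)
        gparams.append(T.grad(cost, param))
    for param, gp in zip(params, gparams):
        exp_sg, exp_su = exp_sqr_grads[param], exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)              # running E[g^2]
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_sg] = up_exp_sg
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)      # running E[dx^2]
        stepped_param = param + step
        if param.get_value(borrow=True).ndim == 2 and param.name != word_vec_name:
            # rescale columns whose L2 norm exceeds sqrt(norm_lim)
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            updates[param] = stepped_param * (desired_norms / (1e-7 + col_norms))
        else:
            updates[param] = stepped_param
    return updates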
def train_conv_net(datasets, U, ofile, cv=0, attr=0, img_w=300, filter_hs=[3, 4, 5],
                   hidden_units=[100, 2], dropout_rate=[0.5], shuffle_batch=True, n_epochs=25,
                   batch_size=50, lr_decay=0.95, conv_non_linear="relu", activations=[Iden],
                   sqr_norm_lim=9, non_static=True):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes
    hidden_units = [x, y]: x is the number of feature maps (per filter window), y is the number of output classes
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0][0])
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear),
                  ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)]
    print(parameters)

    # define model architecture
    index = T.iscalar()
    x = T.tensor3('x', dtype=theano.config.floatX)
    y = T.ivector('y')
    mair = T.matrix('mair')

    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector(dtype=theano.config.floatX)
    zero_vec = np.zeros(img_w, dtype=theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(Words, T.set_subtensor(Words[0, :], zero_vec_tensor))],
                               allow_input_downcast=True)

    conv_layers = []
    for i in range(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng, image_shape=None, filter_shape=filter_shape,
                                        poolsize=pool_size, non_linear=conv_non_linear)
        conv_layers.append(conv_layer)

    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], x.shape[1], x.shape[2], Words.shape[1]))

    def convolve_user_statuses(statuses):
        layer1_inputs = []

        def sum_mat(mat, out):
            z = ifelse(T.neq(T.sum(mat, dtype=theano.config.floatX),
                             T.constant(0, dtype=theano.config.floatX)),
                       T.constant(1, dtype=theano.config.floatX),
                       T.constant(0, dtype=theano.config.floatX))
            return out + z, theano.scan_module.until(
                T.eq(z, T.constant(0, dtype=theano.config.floatX)))

        status_count, _ = theano.scan(fn=sum_mat, sequences=statuses,
                                      outputs_info=T.constant(0, dtype=theano.config.floatX))

        # Slice-out dummy (zeroed) sentences
        relv_input = statuses[:T.cast(status_count[-1], dtype='int32')].dimshuffle(0, 'x', 1, 2)

        for conv_layer in conv_layers:
            layer1_inputs.append(conv_layer.set_input(input=relv_input).flatten(2))

        features = T.concatenate(layer1_inputs, axis=1)
        avg_feat = T.max(features, axis=0)  # max-pool over the user's statuses (despite the 'avg' name)
        return avg_feat

    conv_feats, _ = theano.scan(fn=convolve_user_statuses, sequences=layer0_input)

    # Add Mairesse features
    layer1_input = T.concatenate([conv_feats, mair], axis=1)  ##mairesse_change
    hidden_units[0] = feature_maps * len(filter_hs) + datasets[4].shape[1]  ##mairesse_change

    classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units,
                            activations=activations, dropout_rates=dropout_rate)

    svm_data = T.concatenate([classifier.layers[0].output, y.dimshuffle(0, 'x')], axis=1)

    # define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        # if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    # shuffle dataset and assign to mini batches; if dataset size is not a multiple
    # of the batch size, replicate extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        rand_perm = np.random.permutation(range(len(datasets[0])))
        train_set_x = datasets[0][rand_perm]
        train_set_y = datasets[1][rand_perm]
        train_set_m = datasets[4][rand_perm]
        extra_data_x = train_set_x[:extra_data_num]
        extra_data_y = train_set_y[:extra_data_num]
        extra_data_m = train_set_m[:extra_data_num]
        new_data_x = np.append(datasets[0], extra_data_x, axis=0)
        new_data_y = np.append(datasets[1], extra_data_y, axis=0)
        new_data_m = np.append(datasets[4], extra_data_m, axis=0)
    else:
        new_data_x = datasets[0]
        new_data_y = datasets[1]
        new_data_m = datasets[4]
    rand_perm = np.random.permutation(range(len(new_data_x)))
    new_data_x = new_data_x[rand_perm]
    new_data_y = new_data_y[rand_perm]
    new_data_m = new_data_m[rand_perm]
    n_batches = new_data_x.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))

    # divide train set into train/val sets
    test_set_x = datasets[2]
    test_set_y = np.asarray(datasets[3], "int32")
    test_set_m = datasets[5]
    train_set_x, train_set_y, train_set_m = shared_dataset(
        (new_data_x[:n_train_batches * batch_size],
         new_data_y[:n_train_batches * batch_size],
         new_data_m[:n_train_batches * batch_size]))
    val_set_x, val_set_y, val_set_m = shared_dataset(
        (new_data_x[n_train_batches * batch_size:],
         new_data_y[n_train_batches * batch_size:],
         new_data_m[n_train_batches * batch_size:]))
    n_val_batches = n_batches - n_train_batches

    val_model = theano.function([index], classifier.errors(y),
                                givens={
                                    x: val_set_x[index * batch_size:(index + 1) * batch_size],
                                    y: val_set_y[index * batch_size:(index + 1) * batch_size],
                                    mair: val_set_m[index * batch_size:(index + 1) * batch_size]},  ##mairesse_change
                                allow_input_downcast=False)

    # compile theano functions to get train/val/test errors
    test_model = theano.function([index], [classifier.errors(y), svm_data],
                                 givens={
                                     x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_set_y[index * batch_size:(index + 1) * batch_size],
                                     mair: train_set_m[index * batch_size:(index + 1) * batch_size]},  ##mairesse_change
                                 allow_input_downcast=True)

    train_model = theano.function([index], cost, updates=grad_updates,
                                  givens={
                                      x: train_set_x[index * batch_size:(index + 1) * batch_size],
                                      y: train_set_y[index * batch_size:(index + 1) * batch_size],
                                      mair: train_set_m[index * batch_size:(index + 1) * batch_size]},  ##mairesse_change
                                  allow_input_downcast=True)

    test_y_pred = classifier.predict(layer1_input)
    test_error = T.sum(T.neq(test_y_pred, y), dtype=theano.config.floatX)
    true_p = T.sum(test_y_pred * y, dtype=theano.config.floatX)
    false_p = T.sum(test_y_pred * T.mod(y + T.ones_like(y, dtype=theano.config.floatX),
                                        T.constant(2, dtype='int32')))
    false_n = T.sum(y * T.mod(test_y_pred + T.ones_like(y, dtype=theano.config.floatX),
                              T.constant(2, dtype='int32')))
    test_model_all = theano.function([x, y, mair],  ##mairesse_change
                                     [test_error, true_p, false_p, false_n, svm_data],
                                     allow_input_downcast=True)

    test_batches = test_set_x.shape[0] / batch_size

    # start training over mini-batches
    print('... training')
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    fscore = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in range(int(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in range(int(n_train_batches))]
        train_perf = 1 - np.mean([loss[0] for loss in train_losses])
        val_losses = [val_model(i) for i in range(int(n_val_batches))]
        val_perf = 1 - np.mean(val_losses)
        epoch_perf = 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%' % (
            epoch, time.time() - start_time, train_perf * 100., val_perf * 100.)
        print(epoch_perf)
        ofile.write(epoch_perf + "\n")
        ofile.flush()
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss_list = [
                test_model_all(test_set_x[idx * batch_size:(idx + 1) * batch_size],
                               test_set_y[idx * batch_size:(idx + 1) * batch_size],
                               test_set_m[idx * batch_size:(idx + 1) * batch_size])  ##mairesse_change
                for idx in range(int(test_batches))
            ]
            if test_set_x.shape[0] > test_batches * batch_size:
                test_loss_list.append(
                    test_model_all(test_set_x[int(test_batches * batch_size):],
                                   test_set_y[int(test_batches * batch_size):],
                                   test_set_m[int(test_batches * batch_size):]))  ##mairesse_change
            test_loss_list_temp = test_loss_list
            test_loss_list = np.asarray([t[:-1] for t in test_loss_list])
            test_loss = np.sum(test_loss_list[:, 0]) / float(test_set_x.shape[0])
            test_perf = 1 - test_loss
            tp = np.sum(test_loss_list[:, 1])
            fp = np.sum(test_loss_list[:, 2])
            fn = np.sum(test_loss_list[:, 3])
            tn = test_set_x.shape[0] - (tp + fp + fn)
            fscore = np.mean([2 * tp / float(2 * tp + fp + fn),
                              2 * tn / float(2 * tn + fp + fn)])
            svm_test = np.concatenate([t[-1] for t in test_loss_list_temp], axis=0)
            svm_train = np.concatenate([t[1] for t in train_losses], axis=0)
            output = "Test result: accu: " + str(test_perf) + ", macro_fscore: " + str(fscore) + \
                     "\ntp: " + str(tp) + " tn: " + str(tn) + " fp: " + str(fp) + " fn: " + str(fn)
            print(output)
            ofile.write(output + "\n")
            ofile.flush()
            # dump train and test features
            pickle.dump(svm_test, open("cvte" + str(attr) + str(cv) + ".p", "wb"))
            pickle.dump(svm_train, open("cvtr" + str(attr) + str(cv) + ".p", "wb"))
        updated_epochs = refresh_epochs()
        if updated_epochs != None and n_epochs != updated_epochs:
            n_epochs = updated_epochs
            print('Epochs updated to ' + str(n_epochs))
    return test_perf, fscore
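
# --------------------------------------------------------------------------
# Quick sanity check (not part of the original code) of the macro F-score
# formula used above: the positive-class F1 is computed from (tp, fp, fn),
# the negative-class F1 from (tn, fn, fp), and the reported fscore is their
# mean.  The toy counts below are arbitrary; any confusion-matrix values
# work the same way.
# --------------------------------------------------------------------------
tp, tn, fp, fn = 40., 35., 15., 10.
f1_pos = 2 * tp / (2 * tp + fp + fn)   # 80/105, about 0.762
f1_neg = 2 * tn / (2 * tn + fp + fn)   # 70/95, about 0.737
macro_f = np.mean([f1_pos, f1_neg])    # about 0.749
print('macro F-score on toy counts: %.3f' % macro_f)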