def get_updates(nnet, train_obj, trainable_params):
    """Build the parameter updates for the solver selected on ``nnet``.

    The solver name is read from ``nnet.solver``. When that attribute is
    missing, or its value is not one of the implemented solvers, Nesterov
    momentum is used instead. The effective choice is written to
    ``nnet.sgd_solver``.

    Args:
        nnet: network object; ``solver`` is read (if present) and
            ``sgd_solver`` is assigned.
        train_obj: training objective handed to the update rule.
        trainable_params: parameters handed to the update rule.

    Returns:
        Whatever the selected ``l_updates`` rule returns.
    """
    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    requested = getattr(nnet, "solver", None)
    if requested in implemented_solvers:
        nnet.sgd_solver = requested
    else:
        nnet.sgd_solver = "nesterov"

    chosen = nnet.sgd_solver
    if chosen == "nesterov":
        # Nesterov is the only rule here that also takes a momentum term.
        return l_updates.nesterov_momentum(train_obj, trainable_params,
                                           learning_rate=Cfg.learning_rate,
                                           momentum=0.9)

    # The remaining solver keys are spelled exactly like the l_updates rule.
    rule = getattr(l_updates, chosen)
    return rule(train_obj, trainable_params, learning_rate=Cfg.learning_rate)
def train(self, X_train, X_valid, early_stop_count=20, X_test=None):
    """Train with Adadelta and stop early on stalled validation error.

    The objective is the batch mean of the Euclidean norm of
    ``last_layer_output.flatten(2) - self.X`` (reported as "mae" in the
    log), plus 0.001 times the sum of every layer's ``L2`` attribute.

    After each epoch a summary line is written to stdout. Whenever the
    validation error improves, ``self.params`` is pickled to
    ``model/<self.name>.model`` and the patience counter is reset; training
    ends once ``early_stop_count`` consecutive epochs bring no improvement.

    Side effects: sets ``self.train_model`` and ``self.valid_model``.

    Args:
        X_train: training samples, indexed along the first axis.
        X_valid: validation samples, indexed along the first axis.
        early_stop_count: epochs without improvement tolerated before
            stopping.
        X_test: unused; kept so existing callers keep working.
    """
    l2_norm_squared = 0.001 * sum([layer.L2 for layer in self.layers])
    residual = self.layers[-1].output.flatten(2) - self.X
    mae = T.mean(T.sqrt(T.sum(T.sqr(residual), axis=1)), axis=0)
    cost = mae + l2_norm_squared
    updates = adadelta(cost, self.params)

    self.train_model = theano.function(inputs=[self.X], outputs=[cost, mae], updates=updates)
    self.valid_model = theano.function(inputs=[self.X], outputs=[cost, mae])

    batch_size = self.mini_batch_size
    # Trailing samples that do not fill a whole mini-batch are dropped.
    num_training_batches = int(X_train.shape[0] / batch_size)
    num_validation_batches = int(X_valid.shape[0] / batch_size)

    counter = 0
    # Start from +inf so the first finite validation error always counts as
    # an improvement (a fixed threshold could leave the model never saved).
    best_valid_err = np.inf
    epoch_i = 0
    train_rand_idxs = list(range(X_train.shape[0]))
    valid_rand_idxs = list(range(X_valid.shape[0]))

    while counter < early_stop_count:
        epoch_i += 1
        train_costs = []
        train_errs = []
        valid_costs = []
        valid_errs = []

        np.random.shuffle(train_rand_idxs)
        for batch_i in range(num_training_batches):
            start = batch_i * batch_size
            mnb_X = X_train[train_rand_idxs[start:start + batch_size]]
            train_cost, train_err = self.train_model(mnb_X)
            train_costs.append(train_cost)
            train_errs.append(train_err)

        np.random.shuffle(valid_rand_idxs)
        for batch_i in range(num_validation_batches):
            start = batch_i * batch_size
            # Validate on the validation set (the training set was being
            # indexed here before, which made early stopping meaningless).
            mnb_X = X_valid[valid_rand_idxs[start:start + batch_size]]
            valid_cost, valid_err = self.valid_model(mnb_X)
            valid_costs.append(valid_cost)
            valid_errs.append(valid_err)

        train_err = np.mean(np.array(train_errs))
        train_cost = np.mean(np.array(train_costs))
        val_err = np.mean(np.array(valid_errs))
        val_cost = np.mean(np.array(valid_costs))

        summary = ("Epoch " + str(epoch_i) + " Train cost: " + str(train_cost)
                   + "Train mae: " + str(train_err)
                   + " Validation cost: " + str(val_cost)
                   + " Validation mae " + str(val_err))

        if val_err < best_valid_err:
            best_valid_err = val_err
            # The counter is logged before it is reset.
            sys.stdout.write(summary + ",counter " + str(counter) + " __best__ \n")
            sys.stdout.flush()
            counter = 0
            with open("model/" + self.name + ".model", mode="wb") as f:
                cPickle.dump(self.params, f)
        else:
            counter += 1
            sys.stdout.write(summary + ",counter " + str(counter) + "\n")
            sys.stdout.flush()
def generate_theano_func(args, network, penalty, input_dict, target_var):
    """Compile the training and validation functions for ``network``.

    The training loss is the mean categorical cross-entropy of the
    network output against ``target_var`` plus ``penalty``. The validation
    loss is the same cross-entropy computed on the deterministic output,
    without the penalty.

    NOTE(review): the compiled functions take ``input1_var``,
    ``input1_mask_var``, ``input2_var`` and ``input2_mask_var`` as inputs.
    These are not parameters of this function, so they are assumed to be
    module-level variables; confirm against the rest of the file.

    Args:
        args: options object; ``optimizer``, ``step`` (learning rate) and
            ``task`` are read.
        network: output layer of the model.
        penalty: regularisation term added to the training loss.
        input_dict: inputs mapping forwarded to ``get_output``.
        target_var: symbolic target variable.

    Returns:
        ``(train_fn, val_fn)``. ``train_fn`` applies one update and returns
        the training loss. ``val_fn`` returns ``[test_loss,
        test_prediction]`` for task ``"sts"`` and ``[test_loss, test_acc]``
        for task ``"ent"``.

    Raises:
        ValueError: if ``args.optimizer`` or ``args.task`` is not one of
            the supported values.
    """
    prediction = get_output(network, input_dict)
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, input_dict, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    fn_inputs = [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var]

    train_fn = theano.function(
        list(fn_inputs),
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_outputs = [test_loss, test_prediction]
    elif args.task == "ent":
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_outputs = [test_loss, test_acc]
    else:
        # Previously this fell through to an unbound ``val_fn`` at return.
        raise ValueError("Unknown task: %r (expected 'sts' or 'ent')" % (args.task,))

    val_fn = theano.function(
        list(fn_inputs),
        val_outputs,
        allow_input_downcast=True,
    )

    return train_fn, val_fn
def __init__(self, output_size, meta_size, depth=2):
    """Build a dense network that outputs a mean and a log-variance.

    The two inputs (``output_size`` and ``meta_size`` columns) are
    concatenated and passed through ``depth`` batch-normalised dense layers
    whose widths come from a fixed three-entry list, so ``depth`` above 3
    raises ``IndexError``. A linear layer of width ``2 * output_size`` is
    then sliced into ``mu`` (first half) and ``log_var`` (second half).

    Compiled functions stored on ``self``:
      * ``_train_fn(input, meta, target)``: one Adadelta step, returns the
        mean of ``neg_log_likelihood2``.
      * ``_loss_fn(input, meta, target)``: the same loss built from the
        deterministic outputs.
      * ``_predict_fn(input, meta)``: deterministic ``[mu, log_var]``.

    Args:
        output_size: number of columns of the first input and of each of
            ``mu`` / ``log_var``.
        meta_size: number of columns of the second input.
        depth: number of hidden dense layers.
    """
    hidden_widths = [64, 64, 64]

    input_var = TT.matrix()
    meta_var = TT.matrix()
    target_var = TT.matrix()
    mask_var = TT.matrix()

    joined = layers.ConcatLayer([
        layers.InputLayer((None, output_size), input_var=input_var),
        layers.InputLayer((None, meta_size), input_var=meta_var),
    ])

    hidden = joined
    for level in xrange(depth):
        hidden = layers.batch_norm(layers.DenseLayer(hidden, hidden_widths[level]))

    head = layers.DenseLayer(hidden, 2 * output_size,
                             nonlinearity=nonlinearities.linear)
    mu = layers.SliceLayer(head, slice(0, output_size), axis=1)
    log_var = layers.SliceLayer(head, slice(output_size, None), axis=1)

    # NOTE(review): mask_var is passed to the loss but is not listed among
    # the inputs of the functions compiled below -- confirm that
    # neg_log_likelihood2 does not actually depend on it.
    loss = neg_log_likelihood2(
        target_var,
        layers.get_output(mu),
        layers.get_output(log_var),
        mask_var,
    ).mean()
    test_loss = neg_log_likelihood2(
        target_var,
        layers.get_output(mu, deterministic=True),
        layers.get_output(log_var, deterministic=True),
        mask_var,
    ).mean()

    params = layers.get_all_params(head, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._train_fn = theano.function(
        [input_var, meta_var, target_var],
        updates=param_updates,
        outputs=loss,
    )
    self._loss_fn = theano.function(
        [input_var, meta_var, target_var],
        outputs=test_loss,
    )

    predicted_mu = layers.get_output(mu, deterministic=True)
    predicted_log_var = layers.get_output(log_var, deterministic=True)
    self._predict_fn = theano.function(
        [input_var, meta_var],
        outputs=[predicted_mu, predicted_log_var],
    )
def build_treatment_model(self, n_vars, **kwargs):
    """Build the treatment network and compile its Theano functions.

    Architecture: input -> dropout(0.2) -> ReLU dense of width
    ``2 * dense_size`` -> batch norm -> dropout(0.2), then
    ``n_dense_layers - 1`` repetitions of ReLU dense (``dense_size``) ->
    batch norm, and finally one linear output unit stored on
    ``self.treatment_output``.

    Compiled functions stored on ``self``:
      * ``_train_fn(input_vars, targets, instrument_vars)``: one Adadelta
        step, returns ``gmm_loss`` plus 1e-4 times the L2 penalty.
      * ``_loss_fn(input_vars, targets, instrument_vars)``: the same loss
        expression without updates.
      * ``_output_fn(input_vars)``: the deterministic prediction.

    Args:
        n_vars: number of input columns.
        **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

    Returns:
        Parameter values of the freshly built network, as returned by
        ``layers.get_all_param_values``.
    """
    input_vars = TT.matrix()
    instrument_vars = TT.matrix()
    targets = TT.vector()

    width = kwargs['dense_size']
    relu = nonlinearities.rectify

    net = layers.InputLayer((None, n_vars), input_vars)
    net = layers.DropoutLayer(net, p=0.2)

    # The first hidden block is twice as wide and is followed by dropout.
    net = layers.DenseLayer(net, 2 * width, nonlinearity=relu)
    net = layers.batch_norm(net)
    net = layers.DropoutLayer(net, p=0.2)

    extra_blocks = kwargs['n_dense_layers'] - 1
    for _ in xrange(extra_blocks):
        net = layers.batch_norm(layers.DenseLayer(net, width, nonlinearity=relu))

    self.treatment_output = layers.DenseLayer(net, 1, nonlinearity=nonlinearities.linear)
    output_layer = self.treatment_output
    init_params = layers.get_all_param_values(output_layer)

    prediction = layers.get_output(output_layer, deterministic=False)
    test_prediction = layers.get_output(output_layer, deterministic=True)

    l2_cost = regularization.regularize_network_params(output_layer, regularization.l2)
    loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

    params = layers.get_all_params(output_layer, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._train_fn = theano.function(
        [input_vars, targets, instrument_vars],
        loss,
        updates=param_updates,
    )
    self._loss_fn = theano.function(
        [input_vars, targets, instrument_vars],
        loss,
    )
    self._output_fn = theano.function([input_vars], test_prediction)

    return init_params
def adadelta_momentum(grads, params, learning_rate=1.0, momentum=0.9, rho=0.95, epsilon=1e-06):
    """Return Adadelta updates with Nesterov momentum applied on top.

    Args:
        grads: first argument forwarded to ``adadelta``.
        params: parameters to update; also the parameters momentum is
            applied to.
        learning_rate: forwarded to ``adadelta``.
        momentum: forwarded to ``apply_nesterov_momentum``.
        rho: forwarded to ``adadelta``.
        epsilon: forwarded to ``adadelta``.

    Returns:
        The result of ``apply_nesterov_momentum`` on the Adadelta updates.
    """
    base_updates = adadelta(grads, params, learning_rate, rho, epsilon)
    return apply_nesterov_momentum(base_updates, params=params, momentum=momentum)
def build_instrument_model(self, n_vars, **kwargs):
    """Build the instrument network and compile its Theano functions.

    Architecture: input -> dropout(0.2) -> tanh dense -> dropout(0.2), then
    ``n_dense_layers - 1`` repetitions of tanh dense -> dropout(0.5), and a
    single linear output unit stored on ``self.instrument_output``.

    Compiled functions stored on ``self``:
      * ``_instrument_train_fn(targets, instrument_vars)``: one Adadelta
        step, returns mean squared error plus 1e-4 times the L2 penalty.
      * ``_instrument_loss_fn(targets, instrument_vars)``: mean squared
        error only, built from the non-deterministic prediction.
      * ``_instrument_output_fn(instrument_vars)``: deterministic
        prediction.

    Args:
        n_vars: number of input columns.
        **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

    Returns:
        Parameter values of the freshly built network, as returned by
        ``layers.get_all_param_values``.
    """
    targets = TT.vector()
    instrument_vars = TT.matrix()

    width = kwargs['dense_size']
    tanh = nonlinearities.tanh

    net = layers.InputLayer((None, n_vars), instrument_vars)
    net = layers.DropoutLayer(net, p=0.2)
    net = layers.DenseLayer(net, width, nonlinearity=tanh)
    net = layers.DropoutLayer(net, p=0.2)

    extra_blocks = kwargs['n_dense_layers'] - 1
    for _ in xrange(extra_blocks):
        net = layers.DenseLayer(net, width, nonlinearity=tanh)
        net = layers.DropoutLayer(net, p=0.5)

    self.instrument_output = layers.DenseLayer(
        net, 1, nonlinearity=nonlinearities.linear)
    output_layer = self.instrument_output
    init_params = layers.get_all_param_values(output_layer)

    prediction = layers.get_output(output_layer, deterministic=False)
    test_prediction = layers.get_output(output_layer, deterministic=True)

    # Flexible here: the endogenous variable can be categorical,
    # continuous, etc.; squared error is what is used at present.
    l2_cost = regularization.regularize_network_params(
        output_layer, regularization.l2)
    # The unregularised error is reported by the loss function; the
    # regularised one is what gets optimised.
    loss_total = objectives.squared_error(
        prediction.flatten(), targets.flatten()).mean()
    loss = loss_total + 1e-4 * l2_cost

    params = layers.get_all_params(output_layer, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._instrument_train_fn = theano.function(
        [targets, instrument_vars], loss, updates=param_updates)
    self._instrument_loss_fn = theano.function(
        [targets, instrument_vars], loss_total)
    self._instrument_output_fn = theano.function(
        [instrument_vars], test_prediction)

    return init_params
def run(self, parameter, parameterName, loss, **kwargs):
    """Compute the Adadelta update for one parameter.

    The gradient of ``loss`` with respect to the parameter's variable is
    passed to ``LUP.adadelta`` with the ``lr``, ``rho`` and ``epsilon``
    hyper-parameters of this optimizer. Every other variable that appears
    in the returned updates is registered on the result as a co-parameter
    named ``<parameterName>_adadelta_<k>``, with ``k`` counting the
    co-parameters from 0.

    Args:
        parameter: object exposing ``getVar()``.
        parameterName: name used for the result and the co-parameter names.
        loss: expression to differentiate.
        **kwargs: accepted but not used here.

    Returns:
        An ``OptimizerResult`` for the parameter.
    """
    var = parameter.getVar()
    gradient = tt.grad(loss, var)

    adadelta_updates = LUP.adadelta(
        [gradient],
        [var],
        learning_rate=self.getHP("lr"),
        rho=self.getHP("rho"),
        epsilon=self.getHP("epsilon"),
    )

    result = OptimizerResult(var, parameterName, gradient, adadelta_updates[var])

    # Everything besides the parameter itself is optimizer-internal state.
    co_updates = [(shared, new_value)
                  for shared, new_value in adadelta_updates.items()
                  if shared is not var]
    for k, (shared, new_value) in enumerate(co_updates):
        co_name = "%s_adadelta_%s" % (parameterName, k)
        result.addCoParameter(shared, co_name, None, new_value)

    return result
def get_updates(nnet, train_obj, trainable_params, solver=None):
    """Build the parameter updates for the requested solver.

    When ``solver`` is not one of the implemented solvers (including the
    default ``None``), Adam is used. The effective choice is written to
    ``nnet.sgd_solver``.

    Args:
        nnet: network object; ``sgd_solver`` is assigned.
        train_obj: training objective handed to the update rule.
        trainable_params: parameters handed to the update rule.
        solver: one of "sgd", "momentum", "nesterov", "adagrad", "rmsprop",
            "adadelta", "adam", "adamax".

    Returns:
        Whatever the selected ``l_updates`` rule returns.
    """
    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad",
                           "rmsprop", "adadelta", "adam", "adamax")

    nnet.sgd_solver = solver if solver in implemented_solvers else "adam"
    chosen = nnet.sgd_solver

    # Only "nesterov" is spelled differently from its l_updates rule.
    rule_name = "nesterov_momentum" if chosen == "nesterov" else chosen
    rule = getattr(l_updates, rule_name)

    hyper = {"learning_rate": Cfg.learning_rate}
    if chosen in ("momentum", "nesterov"):
        hyper["momentum"] = Cfg.momentum
    elif chosen in ("rmsprop", "adadelta"):
        hyper["rho"] = Cfg.rho

    return rule(train_obj, trainable_params, **hyper)
def build_instrument_model(self, n_vars, **kwargs):
    """Construct the instrument regressor and its compiled functions.

    Layers, in order: input, dropout(0.2), tanh dense, dropout(0.2); then
    ``n_dense_layers - 1`` pairs of tanh dense and dropout(0.5); then one
    linear unit, kept on ``self.instrument_output``.

    Sets on ``self``:
      * ``_instrument_train_fn(targets, instrument_vars)``: performs an
        Adadelta step and returns mean squared error + 1e-4 * L2 penalty.
      * ``_instrument_loss_fn(targets, instrument_vars)``: returns the mean
        squared error of the non-deterministic prediction.
      * ``_instrument_output_fn(instrument_vars)``: returns the
        deterministic prediction.

    Args:
        n_vars: number of input columns.
        **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

    Returns:
        The initial parameter values from ``layers.get_all_param_values``.
    """
    targets = TT.vector()
    instrument_vars = TT.matrix()
    fit_inputs = [targets, instrument_vars]

    def tanh_dense(incoming):
        # One hidden layer; its width is read from the keyword arguments.
        return layers.DenseLayer(incoming, kwargs['dense_size'],
                                 nonlinearity=nonlinearities.tanh)

    stack = layers.InputLayer((None, n_vars), instrument_vars)
    stack = layers.DropoutLayer(stack, p=0.2)
    stack = layers.DropoutLayer(tanh_dense(stack), p=0.2)
    for _ in xrange(kwargs['n_dense_layers'] - 1):
        stack = layers.DropoutLayer(tanh_dense(stack), p=0.5)

    self.instrument_output = layers.DenseLayer(stack, 1, nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.instrument_output)

    prediction = layers.get_output(self.instrument_output, deterministic=False)
    test_prediction = layers.get_output(self.instrument_output, deterministic=True)

    # Flexible here: the endogenous variable can be categorical,
    # continuous, etc.
    flat_prediction = prediction.flatten()
    flat_targets = targets.flatten()
    l2_cost = regularization.regularize_network_params(self.instrument_output, regularization.l2)
    loss = objectives.squared_error(flat_prediction, flat_targets).mean() + 1e-4 * l2_cost
    loss_total = objectives.squared_error(flat_prediction, flat_targets).mean()

    params = layers.get_all_params(self.instrument_output, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._instrument_train_fn = theano.function(list(fit_inputs), loss, updates=param_updates)
    self._instrument_loss_fn = theano.function(list(fit_inputs), loss_total)
    self._instrument_output_fn = theano.function([instrument_vars], test_prediction)

    return init_params
def train_setup():
    """Build the classifier network and compile its train/validation functions.

    The network comes from ``cnn(x, config.input_length,
    config.output_length)``. When ``config.init_model`` is set, initial
    weights are read from that ``.npz`` file (arrays ``arr_0``, ``arr_1``,
    ...) and loaded into the network.

    The optimised objective is the mean categorical cross-entropy plus the
    L1 and L2 penalties weighted by ``config.l1_weight`` and
    ``config.l2_weight``, minimised with Adadelta using
    ``config.learning_rate``, ``config.rho`` and ``config.eps``.

    Returns:
        ``(network, train_fn, val_fn)`` where ``train_fn(x, y)`` applies one
        update and returns ``[cross_entropy, l1_norm, l2_norm,
        prediction]``, and ``val_fn(x, y)`` returns ``[cross_entropy,
        prediction]`` computed deterministically.
    """
    x = T.tensor3('input')
    y = T.lvector('output')

    network = cnn(x, config.input_length, config.output_length)
    # Single-argument call form, valid both as a Python 2 print statement
    # and as the Python 3 print function.
    print('Number of Parameters {0}'.format(count_params(network)))

    if config.init_model is not None:
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        # The weights belong to ``network``; the name ``decoding`` used here
        # before is not defined in this function.
        set_all_param_values(network, param_values)

    # training tasks in sequence
    prediction = get_output(network)

    ent = categorical_crossentropy(prediction, y)
    ent = ent.mean()

    l1_norm = config.l1_weight * regularize_network_params(network, l1)
    l2_norm = config.l2_weight * regularize_network_params(network, l2)

    total_error = ent + l1_norm + l2_norm

    params = get_all_params(network, trainable=True)

    updates = adadelta(
        total_error,
        params,
        config.learning_rate,
        config.rho,
        config.eps,
    )

    train_fn = function(
        [x, y],
        [ent, l1_norm, l2_norm, prediction],
        updates=updates,
        allow_input_downcast=True,
    )

    val_prediction = get_output(network, deterministic=True)
    val_ent = categorical_crossentropy(val_prediction, y)
    val_ent = val_ent.mean()

    val_fn = function([x, y], [val_ent, val_prediction], allow_input_downcast=True)

    return network, train_fn, val_fn
def train_setup():
    """Build the encoder/decoder network and compile its functions.

    ``cnn`` returns the encoding and decoding layers. When
    ``config.init_model`` is set, arrays ``arr_0``, ``arr_1``, ... from
    that ``.npz`` file are loaded into the decoding network.

    The optimised objective is the mean squared error between ``y`` and
    the decoder output, plus L1 and L2 penalties weighted by
    ``config.l1_weight`` and ``config.l2_weight``, minimised with Adadelta
    using ``config.learning_rate``, ``config.rho`` and ``config.eps``.

    Returns:
        ``(encoding, decoding, train_fn, val_fn)`` where ``train_fn(x, y)``
        applies one update and returns ``[error, l1_norm, l2_norm]``, and
        ``val_fn(x, y)`` returns the deterministic mean squared error.
    """
    x = T.tensor3('input')
    y = T.matrix('output')

    encoding, decoding = cnn(
        x,
        config.input_length,
        config.output_length,
        config.encoding_length,
    )
    print('Number of Parameters {0}'.format(count_params(decoding)))

    if config.init_model is not None:
        with np.load(config.init_model) as archive:
            stored = []
            for position in range(len(archive.files)):
                stored.append(archive['arr_%d' % position])
        set_all_param_values(decoding, stored)

    # training tasks in sequence
    reconstruction = get_output(decoding)
    error = squared_error(y, reconstruction).mean()

    l1_norm = config.l1_weight * regularize_network_params(decoding, l1)
    l2_norm = config.l2_weight * regularize_network_params(decoding, l2)
    total_error = error + l1_norm + l2_norm

    weights = get_all_params(decoding, trainable=True)
    updates = adadelta(total_error, weights, config.learning_rate,
                       config.rho, config.eps)

    train_fn = function([x, y], [error, l1_norm, l2_norm],
                        updates=updates, allow_input_downcast=True)

    val_reconstruction = get_output(decoding, deterministic=True)
    val_error = squared_error(y, val_reconstruction).mean()
    val_fn = function([x, y], val_error, allow_input_downcast=True)

    return encoding, decoding, train_fn, val_fn
def main():
    """Train the convolutional autoencoder and save encoder and full model.

    Steps: install a SIGINT handler that raises the module-level
    ``terminate`` flag, load the data, build the autoencoder variant named
    by ``options['MODEL']``, train it with Adadelta on mean squared
    reconstruction error, then write reconstruction / cost / filter plots
    and save both models under ``models/``.

    NOTE(review): ``terminate`` is read before the handler can have set it,
    so it is assumed to be initialised at module level -- confirm.

    Raises:
        ValueError: if ``options['MODEL']`` is not one of ``'normal'``,
            ``'batchnorm'``, ``'dropout'`` or ``'bn+dropout'``.
    """
    def signal_handler(signum, frame):
        # Only raise the flag; the training loop polls it after each batch.
        global terminate
        terminate = True
        print('terminating...')

    signal.signal(signal.SIGINT, signal_handler)
    configure_theano()
    options = parse_options()
    X, X_val = generate_data()

    print('X type and shape:', X.dtype, X.shape)
    print('X.min():', X.min())
    print('X.max():', X.max())

    print('X_val type and shape:', X_val.dtype, X_val.shape)
    print('X_val.min():', X_val.min())
    print('X_val.max():', X_val.max())

    # Targets are the inputs flattened to one row per sample.
    X_out = X.reshape((X.shape[0], -1))
    X_val_out = X_val.reshape((X_val.shape[0], -1))
    print('X_out:', X_out.dtype, X_out.shape)
    print('X_val_out', X_val_out.dtype, X_val_out.shape)

    print('constructing and compiling model...')
    input_var = T.tensor3('input', dtype='float32')
    target_var = T.matrix('output', dtype='float32')
    lr = theano.shared(np.array(0.8, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(0.9, dtype=theano.config.floatX)

    # The network input arrives as rows of 1200 values and is reshaped to
    # single-channel 30x40 images inside the graph.
    l_input = InputLayer((None, None, 1200), input_var, name='input')
    l_input = ReshapeLayer(l_input, (-1, 1, 30, 40), name='reshape_input')

    model_name = options['MODEL']
    if model_name == 'normal':
        network, encoder = avletters_convae.create_model(l_input, options)
    elif model_name == 'batchnorm':
        network, encoder = avletters_convae_bn.create_model(l_input, options)
    elif model_name == 'dropout':
        network, encoder = avletters_convae_drop.create_model(l_input, options)
    elif model_name == 'bn+dropout':
        network, encoder = avletters_convae_bndrop.create_model(l_input, options)
    else:
        # Fail here rather than with an unbound ``network`` further down.
        raise ValueError('unknown MODEL option: {!r}'.format(model_name))

    print('AE Network architecture: {}'.format(options['MODEL']))
    print_network(network)

    recon = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(squared_error(recon, target_var))
    updates = adadelta(cost, all_params, lr)

    use_max_constraint = False
    print('apply max norm constraint: {}'.format(use_max_constraint))
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only tensors with more than one dimension, i.e. not biases
                updates[param] = norm_constraint(param, MAX_NORM)

    train = theano.function([input_var, target_var], recon, updates=updates, allow_input_downcast=True)
    train_cost_fn = theano.function([input_var, target_var], cost, allow_input_downcast=True)

    eval_recon = las.layers.get_output(network, deterministic=True)
    eval_cost = T.mean(las.objectives.squared_error(eval_recon, target_var))
    eval_cost_fn = theano.function([input_var, target_var], eval_cost, allow_input_downcast=True)
    recon_fn = theano.function([input_var], eval_recon, allow_input_downcast=True)

    # Compilation can take a while; honour an interrupt received meanwhile.
    if terminate:
        exit()

    NUM_EPOCHS = options['NUM_EPOCHS']
    EPOCH_SIZE = options['EPOCH_SIZE']
    NO_STRIDES = options['NO_STRIDES']
    VAL_NO_STRIDES = options['VAL_NO_STRIDES']

    print('begin training for {} epochs...'.format(NUM_EPOCHS))
    datagen = batch_iterator(X, X_out, 128)

    costs = []
    val_costs = []
    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            batch_X, batch_y = next(datagen)
            # float() keeps the formatting independent of how the shared
            # value is returned, matching the sibling training script.
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(batch_X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            batch_X = batch_X.reshape((-1, 1, 1200))
            train(batch_X, batch_y)
            print('\r', end='')
            if terminate:
                break
        if terminate:
            break

        train_cost = batch_compute_cost(X, X_out, NO_STRIDES, train_cost_fn)
        val_cost = batch_compute_cost(X_val, X_val_out, VAL_NO_STRIDES, eval_cost_fn)
        costs.append(train_cost)
        val_costs.append(val_cost)

        print("Epoch {} train cost = {}, validation cost = {} ({:.1f}sec) "
              .format(epoch + 1, train_cost, val_cost, time.time() - time_start))
        # Decay the learning rate from the twelfth epoch (index 11) onwards.
        if epoch > 10:
            lr.set_value(lr.get_value() * lr_decay)

    X_val_recon = recon_fn(X_val)
    visualize_reconstruction(X_val_out[450:550], X_val_recon[450:550], shape=(30, 40), savefilename='avletters')
    plot_validation_cost(costs, val_costs, None, savefilename='valid_cost')

    conv2d1 = las.layers.get_all_layers(network)[2]
    visualize.plot_conv_weights(conv2d1, (15, 14)).savefig('conv2d1.png')

    print('saving encoder...')
    save_model(encoder, 'models/conv_encoder.dat')
    save_model(network, 'models/conv_ae.dat')
def main():
    """Train an LSTM classifier on autoencoder-encoded video frames.

    Steps: read ``config/separate_train.ini``, load the dataset, split it
    by subject into train / validation / test, obtain the autoencoder
    (fine-tuned here and/or loaded from disk), encode all three splits,
    normalise them with the training statistics, then train the LSTM with
    Adadelta on categorical cross-entropy. Training stops early when
    ``early_stop`` fires on the window of recent validation costs.

    Raises:
        ValueError: if neither ``do_finetune`` nor ``load_finetune`` is
            enabled, because no autoencoder is then available.
    """
    configure_theano()
    config_file = 'config/separate_train.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    lstm_units = int(config.get('training', 'lstm_units'))
    output_units = int(config.get('training', 'output_units'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y),))
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects),))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens),))

    train_subject_ids = read_data_split_file('data/train.txt')
    val_subject_ids = read_data_split_file('data/val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print('Train: {}'.format(train_subject_ids))
    print('Validation: {}'.format(val_subject_ids))
    print('Test: {}'.format(test_subject_ids))

    train_X, train_y, train_vidlens, train_subjects, \
        val_X, val_y, val_vidlens, val_subjects, \
        test_X, test_y, test_vidlens, test_subjects = \
        split_data(X, y, subjects, video_lens,
                   train_subject_ids, val_subject_ids, test_subject_ids)

    # Sanity check: the three splits must partition the dataset.
    assert train_X.shape[0] + val_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + val_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + val_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + val_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    # NOTE(review): val_X is not passed through normalize_input although
    # train_X and test_X are -- confirm whether that is intended.
    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    dbn = None
    if do_finetune:
        dbn = load_dbn(ae_pretrained)
        dbn.initialize()
        dbn.fit(train_X, train_X)
        recon = dbn.predict(test_X)
        visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
                                 reorder_data(recon[800:864], (26, 44)),
                                 shape=(26, 44))
        if save_finetune:
            # Context manager so the file is flushed and closed.
            with open(ae_finetuned, 'wb') as model_file:
                pickle.dump(dbn, model_file)

    if load_finetune:
        print('loading pre-trained encoding layers...')
        # The pickle comes from this project's own configuration path; do
        # not point it at untrusted files.
        with open(ae_finetuned, 'rb') as model_file:
            dbn = pickle.load(model_file)
        dbn.initialize()

    if dbn is None:
        raise ValueError('no autoencoder available: enable do_finetune or '
                         'load_finetune in {}'.format(config_file))

    encoder = extract_encoder(dbn)
    train_X = encoder.predict(train_X)
    val_X = encoder.predict(val_X)
    test_X = encoder.predict(test_X)

    # Feature-wise normalisation using the training-set statistics.
    train_X, mean, std = featurewise_normalize_sequence(train_X)
    val_X = (val_X - mean) / std
    test_X = (test_X - mean) / std

    # IMPT: the encoder was trained with Fortran-ordered images, so to
    # visualize them convert the images to C order with
    # reshape_images_order() first.

    inputs = T.tensor3('inputs', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing lstm classifier...')
    network = lstm_classifier_baseline.create_model((None, None, 50), inputs,
                                                    (None, None), mask,
                                                    lstm_units, output_units)
    print_network(network)

    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only tensors with more than one dimension, i.e. not biases
                updates[param] = norm_constraint(
                    param, MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function(
        [inputs, targets, mask],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask], cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask], test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask], test_predictions, allow_input_downcast=True)

    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    # Up to NUM_EPOCHS epochs of EPOCH_SIZE mini-batches of BATCH_SIZE videos.
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 10
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens, batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens, batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens, batchsize=len(test_vidlens))

    # One batch holding the whole validation (resp. test) split, reused to
    # check progress after every epoch.
    X_val, y_val, mask_val, _ = next(val_datagen)
    X_test, y_test, mask_test, _ = next(test_datagen)

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the previous one."""
        if len(cost_window) < 2:
            return False
        curr = cost_window[0]
        for idx, window_cost in enumerate(cost_window):
            if curr < window_cost or idx == 0:
                curr = window_cost
            else:
                return False
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, _ = next(datagen)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m)
            print('\r', end='')

        # Training cost is measured on the last mini-batch of the epoch.
        train_cost = compute_train_cost(X, y, m)
        val_cost = compute_test_cost(X_val, y_val, mask_val)
        cost_train.append(train_cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = train_cost
        val_window.push(val_cost)

        # gl: percentage by which the current validation cost exceeds the
        # best so far; pk: spread of the last STRIP_SIZE training costs.
        # train_strip starts as zeros, so pk divides by zero during the
        # first STRIP_SIZE - 1 epochs.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr
            # The test split is only evaluated when validation improves.
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test, val_fn)
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, test_cr,
                          time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                          time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch > decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr))
    print('confusion matrix: ')
    plot_confusion_matrix(test_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):
    """Build a 2D-convolution sentence classifier and compile its functions.

    Token ids are embedded with the fixed (non-trainable) ``wordEmbeddings``,
    convolved with 100 filters spanning 3 tokens and the full embedding
    width, max-pooled over all positions, and passed through a sigmoid
    hidden layer of ``args.hiddenDim`` units into a 2-unit softmax.

    The training loss is the mean binary cross-entropy plus an L2 penalty
    (weight 0.5e-4) on the convolution, hidden and output layers.

    Args:
        args: options object; ``hiddenDim``, ``optimizer`` and ``step``
            (learning rate) are read.
        input_var: symbolic input; the input layer is declared as
            ``(None, maxlen)``.
        target_var: symbolic target variable.
        wordEmbeddings: array whose first axis is the embedding dimension
            and whose second axis is the vocabulary; it is transposed
            before being used as the embedding matrix.
        maxlen: sequence length the network is built for.

    Returns:
        ``(train_fn, val_fn)``. ``train_fn(input, target)`` applies one
        update and returns the training loss; ``val_fn(input, target)``
        returns ``[test_loss, test_acc]`` from the deterministic output.

    Raises:
        ValueError: if ``args.optimizer`` is not one of "sgd", "adagrad",
            "adadelta", "nesterov", "rms" or "adam".
    """
    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: each filter covers 3 tokens x full embedding width.
    filter_size = (3, wordDim)
    # Pool across every position the convolution produces.
    pool_size = (maxlen - filter_size[0] + 1, 1)

    input_layer = InputLayer((None, maxlen), input_var=input_var)
    batchsize, _ = input_layer.input_var.shape

    emb = EmbeddingLayer(input_layer, input_size=vocab_size,
                         output_size=wordDim, W=wordEmbeddings.T)
    # Keep the embeddings out of the trainable parameter set.
    emb.params[emb.W].remove("trainable")

    # Add a single channel axis: (batchsize, 1, maxlen, wordDim).
    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)
    forward = FlattenLayer(maxpool)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))

    lambda_val = 0.5 * 1e-4
    layer_weights = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layer_weights, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn
def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359,
             embedding_dim=100, window_size=1, memory_size=40,
             n_memory_slots=8, go_code=1, depth=2, load_dir=None):
    """Create the model parameters and compile the Theano functions.

    Builds shared variables for every entry of the ``randoms`` / ``zeros``
    tables below, optionally overlays attributes loaded from
    ``<load_dir>/params.pkl``, then wires three scans (read the article,
    produce the title while training, produce the title at test time) and
    compiles ``self.learn``, ``self.test`` and ``self.predict``.

    :param hidden_size: width of the hidden state ``h``.
    :param nclasses: number of output classes of the softmax.
    :param num_embeddings: vocabulary size given to the embedding layer.
    :param embedding_dim: embedding width, also the GRU unit count.
    :param window_size: stored on ``self.window_size``; also scales the first
        dimension of ``Wg_a``, ``Wg_t`` and ``Wx``.
    :param memory_size: number of rows of the external memories.
    :param n_memory_slots: total memory slots; half (rounded down) go to the
        article memory, the rest to the title memory.
    :param go_code: index used as the first input of every instance in the
        test-time scan.
    :param depth: number of ``gru<l>`` initial-state parameters to create.
    :param load_dir: if not None, directory containing ``params.pkl`` whose
        unpickled dict is merged into ``self.__dict__``.
    """
    articles, titles = T.imatrices('articles', 'titles')
    n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
    n_title_slots = n_memory_slots - n_article_slots
    n_instances = articles.shape[0]

    self.window_size = window_size

    randoms = {
        # attr: shape
        # 'emb': (num_embeddings + 1, embedding_dim),
        'M_a': (memory_size, n_article_slots),
        'M_t': (memory_size, n_title_slots),
        'w_a': (n_article_slots, ),
        'w_t': (n_title_slots, ),
        'Wg_a': (window_size * embedding_dim, n_article_slots),
        'Wg_t': (window_size * embedding_dim, n_title_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_a': (hidden_size, n_article_slots),
        'We_t': (hidden_size, n_title_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size
    }

    zeros = {
        # attr: shape
        'bg_a': n_article_slots,
        'bg_t': n_title_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_a': n_article_slots,
        'be_t': n_title_slots,
        'bh': hidden_size,
        'b': nclasses,
    }

    for l in range(depth):
        randoms['gru' + str(l)] = (1, embedding_dim)

    def random_shared(name):
        """Return a shared variable of N(0, 0.2^2) noise shaped randoms[name]."""
        shape = randoms[name]
        return theano.shared(
            0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
            name=name)

    def zeros_shared(name):
        """Return a zero-filled shared variable shaped zeros[name]."""
        shape = zeros[name]
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                             name=name)

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(key))

    for key in zeros:
        # create an attribute with associated shape and values equal to 0
        setattr(self, key, zeros_shared(key))

    # dict.keys() cannot be concatenated with "+" on Python 3, and the result
    # must be a real list because entries are removed from it below.
    self.names = list(randoms) + list(zeros)
    # self.names.remove('emb')  # no need to save or update embeddings

    scan_vars = 'h0 w_a M_a w_t M_t'.split()

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    for key in scan_vars:
        # These attributes become symbolic expressions (no longer shared
        # variables), so they are taken out of the saved-name list.
        setattr(self, key, repeat_for_each_instance(getattr(self, key)))
        self.names.remove(key)

    if load_dir is not None:
        # Pickle data must be read in binary mode.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point load_dir at trusted checkpoints.
        with open(os.path.join(load_dir, 'params.pkl'), 'rb') as handle:
            params = pickle.load(handle)
        self.__dict__.update(params)

    def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
        """
        notes
        Headers from paper in all caps
        mem = n_article slots if is_article else n_title_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_a: attention weights for article memory
        :param M_a: article memory
        :param args: gru_weights, maybe w_t, maybe M_t
            gru_weights: weights with which to initialize GRULayer on each time step
            w_t: attention weights for titles memory
            M_t: titles memory
        :param kwargs: is_training, is_article
            is_training:
            is_article: we use different parts of memory when working with a article
        :return: [y = model outputs,
                  y_max = argmax of y,
                  i + 1 = increment index (or y_max at test time),
                  h, w, M (see above)]
        """
        is_training = kwargs['is_training']
        is_article = kwargs['is_article']
        gru_weights = args[:depth]

        # Default to None so the assertion below reports a missing title
        # memory instead of failing with an unbound-name error.
        w_t = None
        M_t = None
        if len(args) > depth:
            w_t = args[depth]
            M_t = args[depth + 1]

        i_type = T.iscalar if is_article or is_training else T.ivector
        assert i.type == i_type

        if not is_article:
            assert w_t is not None and M_t is not None

        word_idxs = i
        if is_article or is_training:
            # get representation of word window
            document = articles if is_article else titles  # [instances, bucket_width]
            word_idxs = document[:, i:i + 1]  # [instances, 1]
        # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

        l_in = InputLayer(shape=(None, 1), input_var=word_idxs)
        embed = EmbeddingLayer(l_in, num_embeddings, embedding_dim)
        gru = GRULayer(incoming=embed, num_units=embedding_dim,
                       hid_init=self.gru0)
        for weight in gru_weights:
            gru = GRULayer(incoming=gru, num_units=embedding_dim,
                           hid_init=weight)
        x_i = get_output(gru).flatten(ndim=2)
        x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

        if is_article:
            M_read = M_a  # [instances, memory_size, n_article_slots]
            w_read = w_a  # [instances, n_article_slots]
        else:
            # titles read from the article and title memories side by side
            M_read = T.concatenate([M_a, M_t], axis=2)
            w_read = T.concatenate([w_a, w_t], axis=1)

        # eqn 15
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        # EXTERNAL MEMORY READ
        def get_attention(Wg, bg, M, w):
            g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.nnet.softplus(beta)
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g) * w + g * w_hat  # [instances, mem]

        w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
        if not is_article:
            w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

        # eqn 10
        y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

            # need to add broadcast layers for memory update
            f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

        M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
        attention_and_memory = [w_a, M_a]
        if not is_article:
            M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            attention_and_memory += [w_t, M_t]

        y_max = y.argmax(axis=1).astype(int32)
        # Teacher forcing / article reading step through the document; at
        # test time the previous prediction is fed back in.
        next_idxs = i + 1 if is_training or is_article else y_max
        return [y, y_max, next_idxs, h] + attention_and_memory

    read_article = partial(recurrence, is_training=True, is_article=True)
    # for read_article, it actually doesn't matter whether is_training is true
    i0 = T.constant(0, dtype=int32, name='first_value_of_i')
    # getattr instead of eval: eval inside a comprehension cannot see `self`
    # on Python 3 (before 3.12) and is unnecessary for attribute lookup.
    gru_weights = [getattr(self, 'gru' + str(l)) for l in range(depth)]
    # NOTE(review): gru_weights are appended to outputs_info although
    # recurrence returns only 6 (article) / 8 (title) values — confirm they
    # should not be passed as non_sequences instead.
    outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

    [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                        outputs_info=outputs_info,
                                        n_steps=articles.shape[1],
                                        name='read_scan')

    produce_title = partial(recurrence, is_training=True, is_article=False)
    # Start title generation from the last step of the article scan.
    outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
    outputs_info.extend([self.w_t, self.M_t])
    bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
    [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='train_scan')

    # loss and updates
    y_clip = T.clip(y, .01, .99)
    y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    # Inverse-frequency weights; targets equal to 0 get weight 0.
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
    loss = objectives.aggregate(losses, weights, mode='sum')
    updates = adadelta(loss, self.params())

    self.learn = theano.function(inputs=[articles, titles],
                                 outputs=[y_max.T, loss],
                                 updates=updates,
                                 allow_input_downcast=True,
                                 name='learn')

    produce_title_test = partial(recurrence, is_training=False,
                                 is_article=False)

    # NOTE(review): compiled before y_max is rebound below, so self.test
    # returns the train_scan (is_training=True) predictions — confirm intended.
    self.test = theano.function(inputs=[articles, titles],
                                outputs=[y_max.T],
                                on_unused_input='ignore')

    # At test time the index input is a vector holding go_code per instance.
    outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
    [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='test_scan')

    self.predict = theano.function(inputs=[articles, titles],
                                   outputs=y_max.T,
                                   name='infer')
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1D-convolution span classifier and compile its functions.

    Architecture: embedding lookup -> reshape so the ``seqlen`` positions act
    as convolution channels -> Conv1D over the concatenated feature
    embeddings -> max-pool across all filters -> sigmoid dense layer ->
    2-unit softmax.

    :param args: options object; reads ``args.hiddenDim`` (dense layer width),
        ``args.optimizer`` ("sgd", "adagrad", "adadelta", "nesterov", "rms"
        or "adam") and ``args.step`` (learning rate).
    :param input_var: symbolic index input, declared to the network as shape
        (batch, seqlen, num_feats).
    :param target_var: symbolic targets compared against the softmax output
        by ``binary_crossentropy`` / ``binary_accuracy``.
    :param wordEmbeddings: embedding matrix laid out as (wordDim, vocab_size);
        it is transposed before being given to the embedding layer.
    :param seqlen: number of positions per example.
    :param num_feats: number of index features per position.
    :return: ``(train_fn, val_fn, network)``. ``train_fn(inputs, targets)``
        applies one optimizer step and returns the L2-regularised loss;
        ``val_fn(inputs, targets)`` returns ``[loss, accuracy]`` from a
        deterministic forward pass; ``network`` is the output layer.
    :raises ValueError: if ``args.optimizer`` is not one of the supported names.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    # important context words as channels

    # CNN_sentence config
    filter_size = wordDim

    l_in = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = l_in.input_var.shape
    # The embedding matrix keeps its "trainable" tag here, so it is updated
    # by the optimizer and included in the L2 penalty below.
    emb = EmbeddingLayer(l_in, input_size=vocab_size, output_size=wordDim,
                         W=wordEmbeddings.T)
    # emb.params[emb.W].remove('trainable')  # (batchsize, seqlen, wordDim)

    # Concatenate the per-feature embeddings of each position.
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))

    conv1d = Conv1DLayer(reshape, num_filters=num_filters,
                         filter_size=filter_size, stride=stride,
                         nonlinearity=tanh, W=GlorotUniform())
    # nOutputFrame = num_filters,
    # nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1

    # Move the filter axis last so the pooling below runs across filters.
    # A separate name is used so `conv1d` keeps pointing at the layer that
    # owns the convolution weights (needed for the L2 penalty).
    conv1d_shuffled = DimshuffleLayer(conv1d, (0, 2, 1))

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d_shuffled, pool_size=pool_size)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))

    # Weighted L2 penalty. Previously `conv1d` had been rebound to the
    # parameter-free DimshuffleLayer, so the convolution weights silently
    # contributed nothing to this penalty.
    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val,
              network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception
        # so the caller sees the intended message.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
def build_chain_trainer(self):
    """Compile the training step(s) for the definition-embedding model.

    Builds a cost from sense-weighted definition embeddings and compiles
    ``self.train_step(wi, nwi, lr) -> [cost, weights]``, which updates every
    entry of ``self.params`` except ``'dwe'`` (plus ``dwe`` itself when
    ``self.no_alt`` is set on the gradient path). Unless ``self.no_alt`` is
    set, it also compiles
    ``self.dwe_train_step(wi, nwi, lam) -> [cost, dwe_ret, weights]``, which
    updates only the word-embedding matrix ``dwe``.

    Reads the flags ``hinge_cost``, ``reg_alpha``, ``do_sgd``, ``no_alt``,
    ``do_fixedpoint``, ``do_rw`` and ``init_dwe`` from ``self``.
    """
    bs = self.bs
    td = self.td
    wi = T.ivector('wi')  # bs (disamb. word indices)
    nwi = T.ivector('nwi')  # negative samples
    lr = T.dscalar('lr').astype(theano.config.floatX)  # learning rate
    lam = T.dscalar('lam').astype(theano.config.floatX)

    L = self.params['L']
    L1 = self.params['L1']  # hd x td
    # Wt = self.params['Wt']
    if not self.hinge_cost:
        # only the regression cost uses the second projection and biases
        L2 = self.params['L2']
        B = self.params['B']  # td
        B2 = self.params['B2']
    dwe = self.params['dwe']

    df = self.dat[wi, :]  # bs x mw x ms
    pr = self.sense_priors[wi, :]  # bs x mw x ms
    mk = self.dmask[wi, :]  # bs x mw x ms
    pd = self.pd[wi, :]  # bs x mdw (plain definition sentence)
    dw = dwe[wi, :]  # bs x td
    msk = self.wmask[wi, :].dimshuffle(0, 1, 'x')  # bs x mw x 1
    ndw = dwe[nwi, :]  # negative words

    def to_weight(d, m, p, prior):
        """Return per-sense weights for one definition (one scan step).

        ``p`` is accepted because the scan passes it, but is not used.
        """
        logit = T.tensordot(dwe[d, :], dwe.T, axes=1)[:, :, d]  # mw x ms x mw x ms
        cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0)  # 1 x 1 x mw
        logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt  # mw x ms x mw
        logit = T.exp(10 * T.switch(T.isnan(logit), 0, logit))  # mw x ms x mw
        logit = T.prod(logit, axis=2) * prior  # mw x ms
        sm = T.sum(logit * m, axis=1, keepdims=True)  # mw x 1
        # mask = T.switch(T.lt(p, 0), 0, 1).dimshuffle(0, 'x')
        logit = (logit * m) / sm  # mw x ms
        # zero out entries that are NaN/inf (e.g. where the normaliser is 0)
        return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)

    def cosim(x, y):
        """Return the mean row-wise cosine similarity of x and y."""
        return T.mean(
            T.sum(x * y, axis=1) / (x.norm(2, axis=1) * y.norm(2, axis=1)))

    weights, _ = theano.scan(fn=to_weight,
                             sequences=[df, mk, pd, pr])  # bs x mw x ms
    hid_inp = dwe[df, :]  # bs x mw x ms x td
    dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp,
                axis=2)  # bs x mw x td

    inp = dat.astype(theano.config.floatX)
    def_emb = T.sum(T.dot(inp, L) * msk, axis=1)  # bs x hd

    if self.hinge_cost:
        def_emb = T.dot(def_emb, L1)
        w_cost = -cosim(def_emb, dw)
        rep = nwi.shape[0] / wi.shape[0]  # b/c there are more negative samples than pos.
        de = T.extra_ops.repeat(def_emb, rep, axis=0)
        w_neg_cost = -cosim(de, ndw)
        cost = T.mean(T.maximum(0, 0.01 + w_cost - w_neg_cost))  # hingeloss
    else:
        regress = T.dot(T.nnet.sigmoid(T.dot(def_emb, L1) + B),
                        L2) + B2  # bs x td
        cost = T.mean((regress - dw) ** 2) + 0.01 * T.sum(abs(L2))  # only regularize the last
    # NOTE(review): applied to both cost variants here — confirm this term
    # was not meant for the regression cost only.
    if self.reg_alpha:
        cost += 0.1 * T.sum(abs(weights))

    all_params = [self.params[k] for k in self.params if k != 'dwe']

    def apply_nesterov_momentum(ups, mom, shape=None):
        """Copy of the same function in Lasagne (with minor changes).

        Returns a NEW OrderedDict with velocity updates added and the
        parameter updates adjusted; ``ups`` itself is left untouched, so the
        caller must use the return value. ``shape`` optionally supplies the
        velocity shapes for parameters that have no ``get_value``.
        """
        params = ups.keys()
        ups = OrderedDict(ups)
        if shape is None:
            shape = [p.get_value(borrow=True).shape for p in params]
        for (param, shp) in zip(params, shape):
            velocity = theano.shared(np.zeros(shp, dtype=theano.config.floatX),
                                     broadcastable=param.broadcastable)
            x = mom * velocity + ups[param] - param
            ups[velocity] = x
            ups[param] = mom * x + ups[param]
        return ups

    dwe_params = [dw, ndw]

    if self.do_sgd:
        grads = T.grad(cost, all_params)
        updates = OrderedDict()
        for (p, g) in zip(all_params, grads):
            updates[p] = p - lr * g
        # The helper works on a copy; its result was previously discarded,
        # which left this branch as plain SGD without momentum.
        updates = apply_nesterov_momentum(updates, mom=0.9)
        if self.no_alt or not self.do_fixedpoint:
            dgrads = T.grad(cost, dwe_params)
            dwe_update = OrderedDict()
            for (p, g) in zip(dwe_params, dgrads):
                dwe_update[p] = p - lr * g
            # NOTE(review): the original also called apply_nesterov_momentum
            # on dwe_update and discarded the result (a no-op). It stays
            # plain SGD here: dwe_update is rebuilt as {dwe: ...} below, so
            # velocity entries for the dw/ndw slices would be dropped.
    else:
        updates = adadelta(cost, all_params, learning_rate=lr)
        if self.no_alt or not self.do_fixedpoint:
            dwe_update = adadelta(cost, dwe_params, learning_rate=lr)

    if not self.no_alt and self.do_fixedpoint:
        # because no alternating training means optimization
        if self.do_rw:
            idf = self.idf[wi].dimshuffle(0, 1, 'x')  # bs x mw x 1 (dat is bs x mw x hd)
            rw_term = T.sum(dat * idf, axis=1)  # bs x hd
            disc_fact = 0.9
            if self.init_dwe:
                posword = (1 - lam) * dw + lam * disc_fact * rw_term
            else:
                base = self.lam * def_emb + (1 - self.lam) * dw
                posword = base + disc_fact * rw_term
            word_update = T.set_subtensor(
                dw, posword.astype(theano.config.floatX))
            dwe_update = {dwe: word_update}
            dwe_ret = T.max(T.abs_(posword - dw))  # max-norm of the increment
        else:
            posword = (1 - self.lam) * def_emb + self.lam * dw
            word_update = T.set_subtensor(dw, posword - self.lam * ndw)
            dwe_update = {dwe: word_update}
            dwe_ret = word_update
    else:
        # Gradient path: fold the per-slice updates for the positive (wi) and
        # negative (nwi) rows back into one update of the full dwe matrix.
        word_update = dwe_update[dw]
        word_update = T.set_subtensor(dw, word_update)
        nword_update = dwe_update[ndw]
        word_update = T.set_subtensor(word_update[nwi, :], nword_update)
        dwe_update = {dwe: word_update}
        if self.no_alt:
            # no alternating training: update dwe in the main training step
            updates.update({dwe: word_update})
        dwe_ret = word_update

    self.train_step = theano.function([wi, nwi, lr], [cost, weights],
                                      updates=updates)
    if not self.no_alt:
        self.dwe_train_step = theano.function([wi, nwi, lam],
                                              [cost, dwe_ret, weights],
                                              updates=dwe_update)
def main():
    """Train and evaluate the end-to-end fusion model described by a config file.

    Reads the config path and optional results path from ``parse_options()``,
    loads the image and DCT ``.mat`` data, optionally loads the two finetuned
    encoders, builds the ``adenet_v3`` model, trains it with adadelta for up
    to ``NUM_EPOCHS`` epochs (stopping early when the validation cost keeps
    rising over the validation window), and finally prints and plots the
    results. When ``options['write_results']`` is set, one CSV line
    ``fusiontype,test_cr,best_val`` is appended to that file.
    """
    configure_theano()
    options = parse_options()
    config_file = options['config']
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('CLI options: {}'.format(options.items()))

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    fusiontype = config.get('models', 'fusiontype')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    train_vidlens = data['trVideoLengthVec'].astype('int').reshape((-1, ))
    val_vidlens = data['valVideoLengthVec'].astype('int').reshape((-1, ))
    test_vidlens = data['testVideoLengthVec'].astype('int').reshape((-1, ))
    train_X = data['trData'].astype('float32')
    val_X = data['valData'].astype('float32')
    test_X = data['testData'].astype('float32')
    train_dct = dct_data['trDctFeatures'].astype('float32')
    val_dct = dct_data['valDctFeatures'].astype('float32')
    test_dct = dct_data['testDctFeatures'].astype('float32')
    train_X_diff = compute_diff_images(train_X, train_vidlens)
    val_X_diff = compute_diff_images(val_X, val_vidlens)
    test_X_diff = compute_diff_images(test_X, test_vidlens)
    # +1 to handle the -1 introduced in lstm_gendata
    train_y = data['trTargetsVec'].astype('int').reshape((-1, )) + 1
    val_y = data['valTargetsVec'].astype('int').reshape((-1, )) + 1
    test_y = data['testTargetsVec'].astype('int').reshape((-1, )) + 1

    # featurewise normalize dct features; validation and test data reuse the
    # statistics computed on the training set
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    val_dct = (val_dct - dct_mean) / dct_std
    test_dct = (test_dct - dct_mean) / dct_std

    # NOTE(review): `ae` / `ae_diff` are only bound when the matching
    # load_finetune flag is true; create_model below fails with NameError
    # otherwise.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load encoder files from a trusted source.
    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        # context manager so the file handle is closed after loading
        with open(ae_finetuned, 'rb') as ae_file:
            ae = pickle.load(ae_file)
        ae.initialize()
    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as ae_diff_file:
            ae_diff = pickle.load(ae_diff_file)
        ae_diff.initialize()

    # IMPT: the encoder was trained with fortran ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # shared so the learning rate can be decayed between epochs
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, l_fuse = adenet_v3.create_model(ae, ae_diff,
                                             (None, None, 1500), inputs,
                                             (None, None), mask,
                                             (None, None, 90), dct,
                                             (None, None, 1500), inputs_diff,
                                             250, window, 10, fusiontype)

    print_network(network)
    # draw_to_file(las.layers.get_all_layers(network), 'network.png')
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions,
                                                          targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = adagrad(cost, all_params, learning_rate=lr)

    train = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, allow_input_downcast=True)

    # deterministic graph for validation / test
    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions, allow_input_downcast=True)

    # We'll train the network for at most NUM_EPOCHS epochs of EPOCH_SIZE
    # minibatches each
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 45
    BATCH_SIZE = 20
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))
    best_val = float('inf')
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens,
                                    batchsize=BATCH_SIZE)
    integral_lens = compute_integral_len(train_vidlens)

    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens,
                                        batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                         batchsize=len(test_vidlens))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(val_vidlens)
    dct_val = gen_seq_batch_from_idx(val_dct, idxs_val, val_vidlens,
                                     integral_lens_val, np.max(val_vidlens))
    X_diff_val = gen_seq_batch_from_idx(val_X_diff, idxs_val, val_vidlens,
                                        integral_lens_val,
                                        np.max(val_vidlens))

    # we use the test set to check final classification rate
    X_test, y_test, mask_test, idxs_test = next(test_datagen)
    integral_lens_test = compute_integral_len(test_vidlens)
    dct_test = gen_seq_batch_from_idx(test_dct, idxs_test, test_vidlens,
                                      integral_lens_test,
                                      np.max(test_vidlens))
    X_diff_test = gen_seq_batch_from_idx(test_X_diff, idxs_test,
                                         test_vidlens, integral_lens_test,
                                         np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when the costs in the window are strictly increasing."""
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, cost in enumerate(cost_window):
                if curr < cost or idx == 0:
                    curr = cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            # return to line start so the next progress line overwrites this one
            print('\r', end='')
        # training cost is measured on the last minibatch of the epoch
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # generalisation loss relative to the best validation cost so far
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        # train_strip starts as zeros, so until STRIP_SIZE epochs have run its
        # minimum is 0 and pk (hence pq) is not meaningful
        pk = 1000 * (np.sum(train_strip) /
                     (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            # new best validation cost: record it and re-evaluate on test
            # NOTE(review): test_cr / test_conf / adascale_param used after
            # the loop are only bound once this branch has run.
            best_val = val_cost
            best_cr = cr
            if fusiontype == 'adasum':
                adascale_param = las.layers.get_all_param_values(
                    l_fuse, scaling_param=True)
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test,
                                                dct_test, X_diff_test,
                                                WINDOW_SIZE, val_fn)
            print(
                "Epoch {} train cost = {}, val cost = {}, "
                "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                        test_cr, time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)".
                  format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                         time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch + 1 >= decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val,
                                                     test_cr))
    if fusiontype == 'adasum':
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    plot_confusion_matrix(test_conf, numbers, fmt='latex')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')

    if options['write_results']:
        results_file = options['write_results']
        with open(results_file, mode='a') as f:
            f.write('{},{},{}\n'.format(fusiontype, test_cr, best_val))
def construct_lstm(input_size, lstm_size, output_size, train_data_gen, val_data_gen):
    """Build, train and evaluate a two-layer bidirectional LSTM classifier.

    Each layer sums a forward and a backward LSTM; dropout sits between the
    two layers, and the last time step of the second layer feeds a softmax
    dense layer. The network is trained with adadelta on categorical
    cross-entropy for ``NUM_EPOCHS`` x ``EPOCH_SIZE`` minibatches, printing
    per-epoch statistics, then the final classification rate and confusion
    matrix are printed/plotted. Nothing is returned.

    :param input_size: number of features per time step.
    :param lstm_size: number of hidden units of every LSTM layer.
    :param output_size: number of classes (softmax units).
    :param train_data_gen: iterator yielding ``(X, y, mask)`` minibatches.
    :param val_data_gen: iterator whose first ``(X, y, mask)`` item is used
        as the validation set.
    """
    # All gates have initializers for the input-to-gate and hidden state-to-gate
    # weight matrices, the cell-to-gate weight vector, the bias vector, and the nonlinearity.
    # The convention is that gates use the standard sigmoid nonlinearity,
    # which is the default for the Gate class.
    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    # prepare the input layers
    # By setting the first and second dimensions to None, we allow
    # arbitrary minibatch sizes with arbitrary sequence lengths.
    # The number of feature dimensions is input_size.
    l_in = InputLayer(shape=(None, None, input_size))
    # This input will be used to provide the network with masks.
    # Masks are expected to be matrices of shape (n_batch, n_time_steps);
    # both of these dimensions are variable for us so we will use
    # an input shape of (None, None)
    l_mask = InputLayer(shape=(None, None))

    # Number of hidden/cell units of every LSTM layer
    N_HIDDEN = lstm_size
    l_lstm = LSTMLayer(
        l_in, N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.)

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_in, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum = ElemwiseSumLayer([l_lstm, l_lstm_back])

    # implement drop-out regularization
    l_dropout = DropoutLayer(l_sum)

    l_lstm2 = LSTMLayer(
        l_dropout, N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.)

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back2 = LSTMLayer(
        l_dropout, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm_back2])

    # The summed layer creates an output of dimension (batch_size, SEQ_LENGTH, N_HIDDEN)
    # Since we are only interested in the final prediction, we isolate that quantity and feed it to the next layer.
    # The output of the sliced layer will then be of size (batch_size, N_HIDDEN)
    l_forward_slice = SliceLayer(l_sum2, -1, 1)

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_out = DenseLayer(
        l_forward_slice, num_units=output_size,
        nonlinearity=las.nonlinearities.softmax)

    # Symbolic variable for the target network output.
    # It will be of shape n_batch, because there's only 1 target value per sequence.
    target_values = T.ivector('target_output')

    # lasagne.layers.get_output produces an expression for the output of the net.
    # This is the stochastic (dropout active) output used for training.
    network_output = las.layers.get_output(l_out)

    # Our cost will be categorical cross entropy error
    cost = T.mean(las.objectives.categorical_crossentropy(network_output,
                                                          target_values))

    # Deterministic output/cost for evaluation: dropout must be disabled when
    # measuring validation cost and classification rate, otherwise the
    # reported numbers are noisy and pessimistic.
    eval_output = las.layers.get_output(l_out, deterministic=True)
    eval_cost = T.mean(las.objectives.categorical_crossentropy(eval_output,
                                                               target_values))

    # Retrieve all trainable parameters from the network
    all_params = las.layers.get_all_params(l_out, trainable=True)

    # Compute adadelta updates for training
    # updates = las.updates.adam(cost, all_params)
    updates = adadelta(cost, all_params)

    # Theano functions for training and computing cost
    train = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost, updates=updates, allow_input_downcast=True)
    compute_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost, allow_input_downcast=True)
    compute_eval_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        eval_cost, allow_input_downcast=True)
    probs = theano.function([l_in.input_var, l_mask.input_var],
                            eval_output, allow_input_downcast=True)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = next(val_data_gen)

    # We'll train the network with NUM_EPOCHS epochs of EPOCH_SIZE minibatches each
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 20
    EPOCH_SIZE = 26
    for epoch in range(NUM_EPOCHS):
        for _ in range(EPOCH_SIZE):
            X, y, m = next(train_data_gen)
            train(X, y, m)
        # training cost is measured on the last minibatch of the epoch
        cost_train.append(compute_cost(X, y, m))
        cost_val.append(compute_eval_cost(X_val, y_val, mask_val))
        cr, _ = evaluate_model(X_val, y_val, mask_val, probs)
        class_rate.append(cr)

        # one good value to early stop using GL technique, alpha = 0.10 (10% worst)
        gl = cost_val[-1] / np.min(cost_val) - 1
        # PQ, GL / Pk(t) where Pk(t) = 1000 * (sum(training strip error) / k * min(training strip error) - 1

        print("Epoch {} train cost = {}, validation cost = {}, generalization loss = {}, classification rate = {}"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, cr))

    cr, conf = evaluate_model(X_val, y_val, mask_val, probs)
    print('Final Model')
    print('classification rate: {}'.format(cr))
    print('confusion matrix: ')
    plot_confusion_matrix(conf, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
def construct_lstm(input_size, lstm_size, output_size, train_data_gen, val_data_gen):
    """Build, train and report on a masked single-layer LSTM classifier.

    The network is: input -> forward LSTM -> slice of the last time step ->
    softmax dense layer.  It is trained with Adadelta on the mean categorical
    cross-entropy and evaluated once per epoch on a single validation batch.
    Training runs for at most ``NUM_EPOCHS`` epochs and stops earlier when
    every cost in the validation window is larger than the one before it.

    :param input_size: number of features per time step (last input axis).
    :param lstm_size: number of hidden/cell units of the LSTM layer.
    :param output_size: number of units (classes) of the softmax output layer.
    :param train_data_gen: generator; every ``next()`` must yield a 4-tuple
        whose first three items are the inputs, integer targets and mask of
        one minibatch (the fourth item is ignored).
    :param val_data_gen: generator; ``next()`` is called exactly once and must
        yield ``(X_val, y_val, mask_val)``.
    :return: None.  Progress is printed per epoch; the best confusion matrix
        and the cost curves are handed to the plotting helpers.
    """
    # All gates have initializers for the input-to-gate and hidden
    # state-to-gate weight matrices and the bias vector.  Gates use the
    # default nonlinearity of the Gate class.
    gate_parameters = Gate(
        W_in=las.init.Orthogonal(),
        W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(),
        W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None,
        b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    # Leaving the first two dimensions as None allows arbitrary minibatch
    # sizes and arbitrary sequence lengths.
    l_in = InputLayer(shape=(None, None, input_size), name='input')
    # Masks are matrices of shape (n_batch, n_time_steps); both dimensions
    # are variable, hence (None, None).
    l_mask = InputLayer(shape=(None, None), name='mask')

    N_HIDDEN = lstm_size
    l_lstm = LSTMLayer(
        l_in,
        N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters,
        forgetgate=gate_parameters,
        cell=cell_parameters,
        outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True,
        grad_clipping=5.,
        name='lstm1')

    # Only the final prediction matters, so keep index -1 along axis 1 (time)
    # and feed that to the dense layer.
    l_forward_slice = SliceLayer(l_lstm, -1, 1, name='slice')

    # One softmax unit per class.
    l_out = DenseLayer(
        l_forward_slice,
        num_units=output_size,
        nonlinearity=las.nonlinearities.softmax,
        name='output')
    print_network(l_out)
    # draw_to_file(las.layers.get_all_layers(l_out), 'network.png')

    # One integer target per sequence.
    target_values = T.ivector('target_output')

    # Training expression (non-deterministic pass) and its cost.
    prediction = las.layers.get_output(l_out)
    cost = T.mean(
        las.objectives.categorical_crossentropy(prediction, target_values))

    # Adadelta updates over all trainable parameters.
    all_params = las.layers.get_all_params(l_out, trainable=True)
    updates = adadelta(cost, all_params)

    # Theano functions for training and computing cost
    train = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost,
        updates=updates,
        allow_input_downcast=True)
    compute_train_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost,
        allow_input_downcast=True)

    # Deterministic pass used for validation cost and predictions.
    test_prediction = las.layers.get_output(l_out, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_prediction, target_values))
    compute_val_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        test_cost,
        allow_input_downcast=True)
    val_fn = theano.function(
        [l_in.input_var, l_mask.input_var],
        test_prediction,
        allow_input_downcast=True)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = next(val_data_gen)

    cost_train = []
    cost_val = []
    class_rate = []
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    # Up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    NUM_EPOCHS = 30
    EPOCH_SIZE = 26
    STRIP_SIZE = 3
    MAX_LOSS = 0.05  # NOTE(review): not read anywhere in this function
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the previous one."""
        if len(cost_window) < 2:
            return False
        previous = None
        for window_cost in cost_window:
            if previous is not None and not previous < window_cost:
                return False
            previous = window_cost
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for _batch in range(EPOCH_SIZE):
            X, y, m, _ = next(train_data_gen)
            train(X, y, m)
        # The training cost is measured on the last minibatch of the epoch.
        train_cost = compute_train_cost(X, y, m)
        val_cost = compute_val_cost(X_val, y_val, mask_val)
        cr, conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        cost_train.append(train_cost)
        cost_val.append(val_cost)
        class_rate.append(cr)

        train_strip[epoch % STRIP_SIZE] = train_cost
        val_window.push(val_cost)

        # Generalization loss in percent relative to the best validation cost.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)

        # Training progress over the strip.  Only the slots written so far are
        # used: the strip starts out as zeros, and including those zeros made
        # the minimum 0 and the ratio infinite during the first epochs.
        filled = train_strip[:min(epoch + 1, STRIP_SIZE)]
        strip_min = np.min(filled)
        if strip_min > 0:
            pk = 1000 * (np.sum(filled) / (filled.size * strip_min) - 1)
        else:
            pk = 0.0
        # No measurable training progress: the quotient is unbounded unless
        # there is no generalization loss either.
        if pk > 0:
            pq = gl / pk
        elif gl > 0:
            pq = float('inf')
        else:
            pq = 0.0

        print(
            "Epoch {} train cost = {}, validation cost = {}, "
            "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
            .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                    time.time() - time_start))

        # Remember the epoch with the lowest validation cost for the report.
        if val_cost < best_val:
            best_val = val_cost
            best_cr = cr
            best_conf = conf

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

    letters = list('abcdefghijklmnopqrstuvwxyz')

    print('Final Model')
    print('classification rate: {}'.format(best_cr))
    print('validation loss: {}'.format(best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, letters, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
def _prepare(self, X, y, X_valid=None, y_valid=None, sample_weight=None,
             whole_dataset_in_device=True):
    """Build the Theano training graph and return the training-loop callbacks.

    Encodes class labels (classification only), builds the model through
    ``self._build_model``, assembles the (optionally weighted and
    regularized) loss, compiles the per-batch update function and wires up
    the epoch/early-stopping callbacks.

    :param X: 2-D training inputs; ``X.shape[1]`` is the input width.
    :param y: training targets; class labels when ``self.is_classification``
        is true, otherwise a 2-D array whose second axis is the output width.
    :param X_valid: optional validation inputs.
    :param y_valid: optional validation targets.
    :param sample_weight: optional per-sample weights; when omitted every
        sample is weighted by 1.
    :param whole_dataset_in_device: when true, ``X`` and ``y`` are stored in
        Theano shared variables and batches are sliced on the device;
        otherwise batches are sliced on the host and passed in per call.
    :return: tuple ``(iter_update, quitter, monitor, observer)``.
    :raises Exception: if ``self.optimization_method`` is not supported.
    """
    self._stats = []
    self._class_label_encoder = LabelEncoder()

    # Validation targets fed to the loss; replaced by encoded labels below
    # when classifying.  Initialised here so regression runs do not hit an
    # undefined name in iter_update.
    y_valid_transformed = y_valid
    if self.is_classification is True:
        self._class_label_encoder.fit(y)
        self.classes_ = self._class_label_encoder.classes_
        y = self._class_label_encoder.transform(y).astype(y.dtype)
        if y_valid is not None:
            y_valid_transformed = self._class_label_encoder.transform(
                y_valid).astype(y_valid.dtype)
    self.y_train_transformed = y

    self._l_x_in = layers.InputLayer(shape=(None, X.shape[1]))
    batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables(
        self.batch_size, y_softmax=self.is_classification)

    # Per-sample weights are a vector; without them a scalar placeholder is
    # used and later bound to the constant 1.
    if sample_weight is not None:
        t_sample_weight = T.vector('sample_weight')
        sample_weight = sample_weight.astype(theano.config.floatX)
    else:
        t_sample_weight = T.scalar('sample_weight')

    if self.is_classification is True:
        y_dim = len(set(y.flatten().tolist()))
    else:
        y_dim = y.shape[1]

    self._prediction_layer = self._build_model(y_dim)
    self._layers = layers.get_all_layers(self._prediction_layer)
    self._build_prediction_functions(X_batch, self._prediction_layer)

    if self.input_noise_function is None:
        output = layers.get_output(self._prediction_layer, X_batch)
    else:
        X_batch_noisy = self.input_noise_function(X_batch)
        output = layers.get_output(self._prediction_layer, X_batch_noisy)

    if self.is_classification:
        # Weighted negative log-likelihood of the true class.
        loss = -T.mean(t_sample_weight * T.log(output)
                       [T.arange(y_batch.shape[0]), y_batch])
    else:
        # Weighted sum of squared errors per sample.
        loss = T.mean(
            t_sample_weight * T.sum((output - y_batch) ** 2, axis=1))

    # Kept separately so reported losses exclude the regularization terms.
    loss_unreg = loss

    all_params = layers.get_all_params(self._prediction_layer)
    if self._output_softener_coefs is not None:
        all_params.append(self._output_softener_coefs)

    # Symbolic parameters, not their numeric values: penalties built from
    # get_all_param_values() are constants and contribute no gradient.
    W_params = layers.get_all_params(
        self._prediction_layer, regularizable=True)

    # regularization
    if self.L1_factor is not None:
        for L1_factor_layer, W in zip(self.L1_factor, W_params):
            loss = loss + L1_factor_layer * T.sum(abs(W))
    if self.L2_factor is not None:
        for L2_factor_layer, W in zip(self.L2_factor, W_params):
            loss = loss + L2_factor_layer * T.sum(W**2)

    # Every supported method name equals the name of the function in the
    # updates module, so the solver is looked up by name.
    method = self.optimization_method
    momentum_methods = ('nesterov_momentum', 'momentum')
    plain_methods = ('adadelta', 'adam', 'adagrad', 'rmsprop', 'sgd')
    if method in momentum_methods:
        gradient_updates = getattr(updates, method)(
            loss, all_params,
            learning_rate=self.learning_rate,
            momentum=self.momentum)
    elif method in plain_methods:
        gradient_updates = getattr(updates, method)(
            loss, all_params, learning_rate=self.learning_rate)
    else:
        raise Exception("wrong optimization method")

    # Number of batches, counting a final partial batch.
    nb_batches = X.shape[0] // self.batch_size
    if (X.shape[0] % self.batch_size) != 0:
        nb_batches += 1

    X = X.astype(theano.config.floatX)
    if self.is_classification == True:
        y = y.astype(np.int32)
    else:
        y = y.astype(theano.config.floatX)

    if whole_dataset_in_device == True:
        X_shared = theano.shared(X, borrow=True)
        y_shared = theano.shared(y, borrow=True)

        givens = {
            X_batch: X_shared[batch_slice],
            y_batch: y_shared[batch_slice]
        }

        if sample_weight is not None:
            sample_weight_shared = theano.shared(
                sample_weight, borrow=True)
            givens[t_sample_weight] = sample_weight_shared[batch_slice]
        else:
            givens[t_sample_weight] = T.as_tensor_variable(
                np.array(1., dtype=theano.config.floatX))

        iter_update_batch = theano.function(
            [batch_index], loss,
            updates=gradient_updates,
            givens=givens,
        )
    else:
        if sample_weight is None:
            iter_update_gradients = theano.function(
                [X_batch, y_batch],
                loss,
                updates=gradient_updates,
                givens={t_sample_weight: T.as_tensor_variable(
                    np.array(1., dtype=theano.config.floatX))},
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl])
        else:
            iter_update_gradients = theano.function(
                [X_batch, y_batch, t_sample_weight],
                loss,
                updates=gradient_updates
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl], sample_weight[sl])

    self._iter_update_batch = iter_update_batch
    self._get_loss = theano.function(
        [X_batch, y_batch, t_sample_weight], loss_unreg,
        allow_input_downcast=True)

    def iter_update(epoch):
        """Run one epoch of batch updates and return its statistics."""
        losses = []
        for i in range(nb_batches):
            losses.append(self._iter_update_batch(i))
            # max norm
            # NOTE(review): this rebinds the ``W`` attribute on the layer
            # object to a symbolic expression after the update function has
            # been compiled, and layers without ``W`` would raise here --
            # confirm whether the constraint should instead be applied to
            # ``gradient_updates`` before compilation.
            if self.max_norm is not None:
                for max_norm_layer, layer in zip(self.max_norm, self._layers):
                    layer.W = updates.norm_constraint(
                        layer.W, max_norm_layer)

        losses = np.array(losses)

        d = OrderedDict()
        d["epoch"] = epoch
        # self.X_train / self.y_train are not set in this method; presumably
        # the caller assigns them before training starts -- TODO confirm.
        d["loss_train"] = self._get_loss(
            self.X_train, self.y_train_transformed, 1.)
        d["accuracy_train"] = (
            self.predict(self.X_train) == self.y_train).mean()

        if X_valid is not None and y_valid is not None:
            d["loss_valid"] = self._get_loss(
                X_valid, y_valid_transformed, 1.)
            if self.is_classification == True:
                d["accuracy_valid"] = (
                    self.predict(X_valid) == y_valid).mean()

        if self.verbose > 0:
            if (epoch % self.report_each) == 0:
                print(tabulate([d], headers="keys"))
        self._stats.append(d)
        return d

    def quitter(update_status):
        """Patience-based early stopping; returns True when training should stop."""
        cur_epoch = len(self._stats) - 1
        if self.patience_nb_epochs > 0:
            # patience heuristic (for early stopping)
            cur_patience_stat = update_status[self.patience_stat]

            if self.cur_best_patience_stat is None:
                self.cur_best_patience_stat = cur_patience_stat
                first_time = True
            else:
                first_time = False

            thresh = self.patience_progression_rate_threshold
            if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time:
                if self.verbose >= 2:
                    fmt = "--Early stopping-- good we have a new best value : {0}={1}, last best : epoch {2}, value={3}"
                    print(fmt.format(self.patience_stat, cur_patience_stat,
                                     self.cur_best_epoch, self.cur_best_patience_stat))
                self.cur_best_epoch = cur_epoch
                self.cur_best_patience_stat = cur_patience_stat
                # Snapshot the model so it can be restored once patience
                # runs out.
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.cur_best_model = self.get_state()
                else:
                    self.cur_best_model = pickle.dumps(
                        self.__dict__, protocol=pickle.HIGHEST_PROTOCOL)

            if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs:
                finish = True
                # Roll back to the best snapshot and drop the later stats.
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.set_state(self.cur_best_model)
                else:
                    self.__dict__.update(pickle.loads(self.cur_best_model))

                self._stats = self._stats[0:self.cur_best_epoch + 1]
                if self.verbose >= 2:
                    print("out of patience...take the model at epoch {0} and quit".format(
                        self.cur_best_epoch + 1))
            else:
                finish = False
            return finish
        else:
            return False

    def monitor(update_status):
        pass

    def observer(monitor_output):
        pass

    return (iter_update, quitter, monitor, observer)
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):
    """Build a bidirectional-LSTM classifier and compile its Theano functions.

    The model embeds the input tokens, runs a forward and a backward LSTM,
    concatenates the last forward step with the first backward step, and
    applies a sigmoid hidden layer followed by a 2-unit softmax.  The training
    loss is the mean binary cross-entropy plus a weighted L2 penalty.

    :param args: options object; reads ``optimizer``, ``step`` (learning
        rate) and ``hiddenDim``.  ``args.lstmDim`` is overwritten with 150.
    :param input_var: symbolic variable for the token-index input.
    :param input_mask_var: symbolic variable for the sequence mask.
    :param target_var: symbolic variable for the targets.
    :param wordEmbeddings: embedding matrix of shape (word_dim, vocab_size);
        its transpose initialises the embedding layer.
    :param seqlen: sequence length used for the input layer shape.
    :return: tuple ``(train_fn, val_fn, network)``; ``val_fn`` returns
        ``[loss, accuracy]`` computed deterministically.
    :raises ValueError: if ``args.optimizer`` is not a supported name.
    """
    # Fail before the graph is built.  The original ``raise "..."`` raised a
    # string, which is itself a TypeError and hid the intended message.
    supported_optimizers = ("sgd", "adagrad", "adadelta", "nesterov", "rms", "adam")
    if args.optimizer not in supported_optimizers:
        raise ValueError(
            "Unknown optimizer %r; expected one of %s"
            % (args.optimizer, ", ".join(supported_optimizers)))

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    # The gradient clipping bound is tied to the embedding width.
    GRAD_CLIP = wordDim

    args.lstmDim = 150

    l_input = InputLayer((None, seqlen), input_var=input_var)
    # From here on seqlen is the symbolic second dimension of the input.
    _batchsize, seqlen = l_input.input_var.shape
    input_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(l_input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                     grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_back = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                          grad_clipping=GRAD_CLIP, nonlinearity=tanh,
                          backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1)  # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1)  # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))

    # L2 penalty with the same weight on each listed layer.
    # NOTE(review): lstm_back is not part of the penalty -- confirm intended.
    lambda_val = 0.5 * 1e-4
    penalized_layers = {emb: lambda_val, lstm: lambda_val,
                        hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(penalized_layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Unreachable after the check on entry; kept as a safeguard should the
        # two lists ever drift apart.
        raise ValueError("Unknown optimizer %r" % (args.optimizer,))

    # Deterministic pass for evaluation.
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, input_mask_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, input_mask_var, target_var],
                             [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
layer1_input = T.concatenate(layer1_inputs,1) input_dims = feature_maps*len(filter_hs) regess = RegressionNeuralNetwork(rng, input=layer1_input,n_in=input_dims,n_hidden=100,n_out=1,activation=[Sigmoid,Sigmoid]) mse = regess.entropy(Y) L2 = sum([conv_layer.L2 for conv_layer in conv_layers]) + regess.L2 cost = mse + L2 params = regess.params for conv_layer in conv_layers: params+=conv_layer.params updates = adadelta(cost,params) train_model = theano.function([X,Y],[mse, cost],updates=updates) valid_model = theano.function([X,Y],[mse, cost]) showfunction = theano.function(inputs=[X],outputs=regess.regressionlayer.y_pred) patience = 0 best_valid_mse_global = 100 early_stop = 20 epoch_i = 0 train_rand_idxs = list(range(0,X_train.shape[0])) valid_rand_idxs = list(range(0,X_valid.shape[0])) while patience < early_stop:
def __init__(self, rng, n_in, n_per_base, n_out, n_layer=1, basefuncs1=None, basefuncs2=None, gradient=None, with_shortcuts=False):
    """Initialize the parameters for the multilayer function graph

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in which
        the datapoints lie

    :type n_layer: int
    :param n_layer: number of hidden layers

    :type n_per_base: int
    :param n_per_base: number of nodes per basis function see FGLayer

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in which
        the labels lie

    :type basefuncs1: [int]
    :param basefuncs1: see FGLayer

    :type basefuncs2: [int]
    :param basefuncs2: see FGLayer

    :type gradient: string
    :param gradient: type of gradient descent algo
        (None, "sgd" or "sgd+" for clipped SGD; "adam", "adadelta",
        "rmsprop", "nag")

    :type with_shortcuts: bool
    :param with_shortcuts: whether to use shortcut connections
        (output is connected to all units)

    :raises ValueError: if ``gradient`` is not one of the supported names
    """
    # Reject unknown optimizers before any graph is built.  The previous
    # ``assert "<non-empty string>"`` could never fail, so an unknown name
    # surfaced later as an AttributeError on ``None.items()``.
    known_gradients = (None, 'sgd+', 'sgd', 'adam', 'adadelta', 'rmsprop', 'nag')
    if gradient not in known_gradients:
        raise ValueError("unknown gradient " + str(gradient))

    self.input = T.matrix('input')  # the data is presented as vector input
    self.labels = T.matrix('labels')  # the labels are presented as vector of continuous values
    self.rng = rng
    self.n_layers = n_layer
    self.hidden_layers = []
    self.params = []
    self.n_in = n_in
    self.n_out = n_out
    self.with_shortcuts = with_shortcuts
    self.fixL0 = False

    # Stack the hidden layers; each one consumes the previous layer's output.
    for layer_idx in range(n_layer):
        if layer_idx == 0:
            layer_input = self.input
            n_input = n_in
        else:
            layer_input = self.hidden_layers[layer_idx - 1].output
            n_input = self.hidden_layers[layer_idx - 1].n_out

        hiddenLayer = FGLayer(
            rng=rng,
            inp=layer_input,
            n_in=n_input,
            n_per_base=n_per_base,
            basefuncs1=basefuncs1,
            basefuncs2=basefuncs2,
            layer_idx=layer_idx,
        )
        self.hidden_layers.append(hiddenLayer)
        self.params.extend(hiddenLayer.params)

    div_thresh = T.scalar("div_thresh")

    # The linear output layer, either it gets as input the output of ALL
    # previous layers
    if self.with_shortcuts:
        output_layer_inp = T.concatenate(
            [hidden.output for hidden in reversed(self.hidden_layers)], axis=1)
        output_layer_n_in = sum([hidden.n_out for hidden in self.hidden_layers])
    else:  # or just of the last hidden layer
        output_layer_inp = self.hidden_layers[-1].output
        output_layer_n_in = self.hidden_layers[-1].n_out

    self.output_layer = DivisionRegression(rng=rng, inp=output_layer_inp,
                                           n_in=output_layer_n_in, n_out=n_out,
                                           div_thresh=div_thresh)
    self.params.extend(self.output_layer.params)

    self.evalfun = theano.function(
        inputs=[self.input, In(div_thresh, value=0.0001)],
        outputs=self.output_layer.output)

    L1_reg = T.scalar('L1_reg')
    L2_reg = T.scalar('L2_reg')
    fixL0 = T.bscalar('fixL0')

    # Regularization terms are summed over the output and all hidden layers.
    self.L1 = self.output_layer.L1 + sum(
        [hidden.L1 for hidden in self.hidden_layers])
    self.L2_sqr = self.output_layer.L2_sqr + sum(
        [hidden.L2_sqr for hidden in self.hidden_layers])
    self.penalty = self.output_layer.penalty

    self.loss = self.output_layer.loss
    self.errors = self.loss

    self.cost = (self.loss(self.labels) + L1_reg * self.L1 +
                 L2_reg * self.L2_sqr + self.penalty)

    # Extrapol penalty
    self.extrapol_cost = self.output_layer.extrapol_loss

    learning_rate = T.scalar('learning_rate')

    def process_updates(par, newp):
        """Post-process one (parameter, new value) update pair."""
        if par.name == "W":
            # if fixL0 is True, then keep small weights at 0
            return par, ifelse(
                fixL0,
                T.switch(T.abs_(par) < 0.001, par * 0, newp),
                newp)
        return par, newp

    # Single formatted argument: prints the same text under Python 2 and 3.
    print("Gradient: %s" % (gradient,))

    update = None
    if gradient in ('sgd+', 'sgd', None):
        # Plain SGD with the step clipped to [-1, 1] per element.
        gparams = [T.grad(self.cost, param) for param in self.params]
        update = OrderedDict([
            (param, param - (learning_rate * gparam).clip(-1.0, 1.0))
            for param, gparam in zip(self.params, gparams)
        ])
    elif gradient == 'adam':
        update = Lupdates.adam(self.cost, self.params, learning_rate, epsilon=1e-04)
    elif gradient == 'adadelta':
        update = Lupdates.adadelta(self.cost, self.params, learning_rate)
    elif gradient == 'rmsprop':
        update = Lupdates.rmsprop(self.cost, self.params, learning_rate)
    elif gradient == 'nag':
        update = Lupdates.nesterov_momentum(self.cost, self.params, learning_rate)
    else:
        # Unreachable after the check on entry; kept as a safeguard.
        raise ValueError("unknown gradient " + str(gradient))

    # Extrapol sanity gradient computation:
    extrapol_updates = Lupdates.adam(self.extrapol_cost, self.params,
                                     learning_rate, epsilon=1e-04)

    updates = [process_updates(*up) for up in update.items()]

    self.train_model = theano.function(
        inputs=[
            self.input, self.labels, L1_reg, L2_reg, fixL0,
            learning_rate, div_thresh
        ],
        outputs=self.cost,
        updates=updates,
    )

    # avoid too large outputs in extrapolation domain
    self.remove_extrapol_error = theano.function(
        inputs=[self.input, learning_rate, div_thresh],
        outputs=self.extrapol_cost,
        updates=extrapol_updates,
    )

    self.test_model = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )

    self.validate_model = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )

    self.L1_loss = theano.function(
        inputs=[],
        outputs=self.L1,
    )

    self.MSE = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )
def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359, embedding_dim=100, window_size=1, memory_size=40, n_memory_slots=8, go_code=1, depth=2, load_dir=None):
    """Create the shared parameters and compile the learn/test/predict functions.

    The model reads ``articles`` with one scan, then produces ``titles`` with
    a second scan, using an external memory that is split into article slots
    and title slots.

    :param hidden_size: width of the hidden state ``h``.
    :param nclasses: number of output classes of the softmax.
    :param num_embeddings: vocabulary size passed to the embedding layer.
    :param embedding_dim: embedding width; also the GRU layers' unit count.
    :param window_size: stored on the instance and used to size ``Wg_*``/``Wx``.
    :param memory_size: number of rows of each memory matrix.
    :param n_memory_slots: total memory slots; half (rounded down) go to the
        article memory, the rest to the title memory.
    :param go_code: index used as the first input when predicting.
    :param depth: number of ``gru<k>`` initial-state parameters created.
    :param load_dir: optional directory containing ``params.pkl``; when given,
        its contents are merged into the instance ``__dict__``.
    """
    articles, titles = T.imatrices('articles', 'titles')
    n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
    n_title_slots = n_memory_slots - n_article_slots
    n_instances = articles.shape[0]
    self.window_size = window_size

    randoms = {
        # attr: shape
        # 'emb': (num_embeddings + 1, embedding_dim),
        'M_a': (memory_size, n_article_slots),
        'M_t': (memory_size, n_title_slots),
        'w_a': (n_article_slots,),
        'w_t': (n_title_slots,),
        'Wg_a': (window_size * embedding_dim, n_article_slots),
        'Wg_t': (window_size * embedding_dim, n_title_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_a': (hidden_size, n_article_slots),
        'We_t': (hidden_size, n_title_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size
    }

    zeros = {
        # attr: shape
        'bg_a': n_article_slots,
        'bg_t': n_title_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_a': n_article_slots,
        'be_t': n_title_slots,
        'bh': hidden_size,
        'b': nclasses,
    }

    for layer_idx in range(depth):
        randoms['gru' + str(layer_idx)] = (1, embedding_dim)

    def random_shared(name):
        """Shared variable drawn from N(0, 0.2^2) with the shape registered for *name*."""
        shape = randoms[name]
        return theano.shared(
            0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
            name=name)

    def zeros_shared(name):
        """Zero-filled shared variable with the shape registered for *name*."""
        shape = zeros[name]
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(key))

    for key in zeros:
        # create an attribute with associated shape and values equal to 0
        setattr(self, key, zeros_shared(key))

    # Explicit lists: concatenating dict.keys() views only works on Python 2.
    self.names = list(randoms) + list(zeros)
    # self.names.remove('emb')  # no need to save or update embeddings
    scan_vars = 'h0 w_a M_a w_t M_t'.split()

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    # The scan state variables get a leading per-instance axis and are taken
    # out of self.names.
    for key in scan_vars:
        setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))
        self.names.remove(key)

    if load_dir is not None:
        # Pickles are binary data; open in 'rb' so loading also works on
        # Windows and Python 3.
        # NOTE: unpickling executes arbitrary code -- only load from trusted
        # directories.
        with open(os.path.join(load_dir, 'params.pkl'), 'rb') as handle:
            params = pickle.load(handle)
            self.__dict__.update(params)

    def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
        """
        notes
        Headers from paper in all caps
        mem = n_article slots if is_article else n_title_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_a: attention weights for article memory
        :param M_a: article memory

        :param args: gru_weights, maybe w_t, maybe M_t
            gru_weights: weights with which to initialize GRULayer on each time step
            w_t: attention weights for titles memory
            M_t: titles memory

        :param kwargs: is_training, is_article
            is_training:
            is_article: we use different parts of memory when working with a article

        :return: [y = model outputs,
                  y_max = argmax of y,
                  i + 1 = increment index (or y_max when predicting),
                  h, w, M (see above)]
        """
        is_training = kwargs['is_training']
        is_article = kwargs['is_article']
        gru_weights = args[:depth]
        if len(args) > depth:
            w_t = args[depth]
            M_t = args[depth + 1]

        i_type = T.iscalar if is_article or is_training else T.ivector
        assert i.type == i_type

        if not is_article:
            assert w_t is not None and M_t is not None

        word_idxs = i
        if is_article or is_training:
            # get representation of word window
            document = articles if is_article else titles  # [instances, bucket_width]
            word_idxs = document[:, i:i+1]  # [instances, 1]
        # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

        l_input = InputLayer(shape=(None, 1), input_var=word_idxs)
        embed = EmbeddingLayer(l_input, num_embeddings, embedding_dim)
        # NOTE(review): self.gru0 initialises this first layer and is also the
        # first element of gru_weights used in the loop below -- confirm the
        # double use is intended.
        gru = GRULayer(incoming=embed, num_units=embedding_dim, hid_init=self.gru0)
        for weight in gru_weights:
            gru = GRULayer(incoming=gru, num_units=embedding_dim, hid_init=weight)
        x_i = get_output(gru).flatten(ndim=2)
        x_i = Print('x_i')(x_i)  # [instances, embedding_dim]
        gru_weights = []  # NOTE(review): never read after this point

        if is_article:
            M_read = M_a  # [instances, memory_size, n_article_slots]
            w_read = w_a  # [instances, n_article_slots]
        else:
            # Article and title memories are read together.
            M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_memory_slots]
            w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_memory_slots]

        # eqn 15
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        # EXTERNAL MEMORY READ
        def get_attention(Wg, bg, M, w):
            g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.nnet.softplus(beta)
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g) * w + g * w_hat  # [instances, mem]

        w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
        if not is_article:
            w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

        # eqn 10
        y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

            # need to add broadcast layers for memory update
            f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

        M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
        attention_and_memory = [w_a, M_a]
        if not is_article:
            M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            attention_and_memory += [w_t, M_t]

        y_max = y.argmax(axis=1).astype(int32)
        # While training or reading, step to the next column; when predicting,
        # feed the predicted word back in.
        next_idxs = i + 1 if is_training or is_article else y_max
        return [y, y_max, next_idxs, h] + attention_and_memory

    read_article = partial(recurrence, is_training=True, is_article=True)
    # for read_article, it actually doesn't matter whether is_training is true

    i0 = T.constant(0, dtype=int32, name='first_value_of_i')
    # getattr instead of eval: same lookup, no string evaluation, and it does
    # not depend on comprehension scoping rules.
    gru_weights = [getattr(self, 'gru' + str(layer_idx)) for layer_idx in range(depth)]
    # NOTE(review): outputs_info carries 6 + depth entries (8 + depth for the
    # title scans) while recurrence returns 6 (or 8) values -- confirm this
    # matches what theano.scan expects.
    outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights
    [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                        outputs_info=outputs_info,
                                        n_steps=articles.shape[1],
                                        name='read_scan')

    produce_title = partial(recurrence, is_training=True, is_article=False)

    # Continue from the final step of the article scan.
    outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
    outputs_info.extend([self.w_t, self.M_t])
    bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
    [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='train_scan')

    # loss and updates
    y_clip = T.clip(y, .01, .99)
    y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
    # Weight each target by the inverse of (its count + 1); targets equal to
    # 0 get weight 0.
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
    loss = objectives.aggregate(losses, weights, mode='sum')
    updates = adadelta(loss, self.params())

    self.learn = theano.function(inputs=[articles, titles],
                                 outputs=[y_max.T, loss],
                                 updates=updates,
                                 allow_input_downcast=True,
                                 name='learn')

    produce_title_test = partial(recurrence, is_training=False, is_article=False)

    # Compiled from the training scan's y_max (the test scan is built below).
    self.test = theano.function(inputs=[articles, titles],
                                outputs=[y_max.T],
                                on_unused_input='ignore')

    # Prediction starts every instance from go_code.
    outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
    [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='test_scan')

    self.predict = theano.function(inputs=[articles, titles],
                                   outputs=y_max.T,
                                   name='infer')
# Restore saved parameters: the first four entries belong to the regression
# network, followed by two entries per convolution layer.
regess.set_params(save_params[:4])
for i in range(len(filter_hs)):
    first = 4 + i * 2
    last = first + 2
    print(first)
    print(last)
    conv_layers[i].set_params(save_params[first:last])

mse = regess.mse(Y)
L2 = sum([conv_layer.L2 for conv_layer in conv_layers]) + regess.L2
cost = mse + L2

# Copy the list first: ``params = regess.params`` followed by ``+=`` extended
# regess.params itself with every convolution parameter.
params = list(regess.params)
for conv_layer in conv_layers:
    params += conv_layer.params

updates = adadelta(cost, params)
train_model = theano.function([X, Y], [mse, cost], updates=updates)
valid_model = theano.function([X, Y], [mse, cost])
# Exposes the hidden layer's output for inspection.
showfunction = theano.function(inputs=[X], outputs=regess.hiddenlayer.output)

# Run it on the first validation minibatch and print it next to the targets.
X_mnb = X_valid[:batch_size]
Y_mnb = Y_valid_rouge2[:batch_size]
print(X_mnb.shape, X_mnb.dtype, Y_mnb, Y_mnb.dtype)
pred = showfunction(X_mnb)
# Single-argument print calls behave identically on Python 2 and 3.
print(pred)
print(Y_valid_rouge2[:batch_size])
def main():
    """Train the ``adenet_v5`` model with one subject held out and report results.

    Dataset paths, encoder paths and training hyper-parameters are read from
    ``config/leave_one_out.ini``; the held-out subject id comes from
    ``options['test_subj']``. The held-out subject's data is used both as the
    validation set for early stopping and for the reported metrics. When
    ``options`` contains a ``'results'`` entry, one
    ``subject, classification rate, validation loss`` line is appended to
    that file after training.

    Raises:
        ValueError: if the configuration does not yield both encoders
            (image encoder via ``do_finetune``/``load_finetune`` and diff
            encoder via ``load_finetune_diff``), or if the held-out subject
            id is not in ``1..53``.
    """
    configure_theano()
    options = parse_options()
    config_file = 'config/leave_one_out.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']
    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y),))
    dct_feats = dct_data['dctFeatures'].astype('float32')
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects),))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens),))

    X_diff = compute_diff_images(X, video_lens)

    # mean remove dct features
    dct_feats = sequencewise_mean_image_subtraction(dct_feats, video_lens)

    test_subject_ids = [options['test_subj']]
    # list(...) so that .remove() also works on Python 3, where range() is
    # not a list.
    train_subject_ids = list(range(1, 54))
    for subj in test_subject_ids:
        train_subject_ids.remove(subj)

    print(train_subject_ids)
    print(test_subject_ids)
    train_X, train_y, train_dct, train_X_diff, train_vidlens, train_subjects, \
        test_X, test_y, test_dct, test_X_diff, test_vidlens, test_subjects = \
        split_data(X, y, dct_feats, X_diff, subjects, video_lens,
                   train_subject_ids, test_subject_ids)

    # The split must partition the data: nothing lost, nothing duplicated.
    assert train_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    # featurewise normalize dct features; the test split reuses the
    # statistics computed on the training split
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    test_dct = (test_dct - dct_mean) / dct_std

    ae = None
    ae_diff = None
    if do_finetune:
        print('performing finetuning on pretrained encoder: {}'.format(
            ae_pretrained))
        ae = load_dbn(ae_pretrained)
        ae.initialize()
        ae.fit(train_X, train_X)
        if save_finetune:
            print('saving finetuned encoder: {}...'.format(ae_finetuned))
            with open(ae_finetuned, 'wb') as model_file:
                pickle.dump(ae, model_file)

    # NOTE: pickle.load can execute arbitrary code; only point the config at
    # trusted model files.
    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()

    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()

    if ae is None or ae_diff is None:
        raise ValueError(
            'both encoders are required: enable do_finetune or load_finetune '
            'for the image encoder and load_finetune_diff for the diff encoder')

    # IMPT: the encoder was trained with fortran ordered images, so to
    # visualize convert all the images to C order using reshape_images_order()

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed in place
    # without recompiling the training function.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network = adenet_v5.create_model(ae, ae_diff, (None, None, 1144), inputs,
                                     (None, None), mask,
                                     (None, None, 90), dct,
                                     (None, None, 1144), inputs_diff,
                                     250, window, 10)
    print_network(network)

    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(
        predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(
                    param,
                    MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function([inputs, targets, mask, dct, inputs_diff, window],
                            cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost, allow_input_downcast=True)
    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions, allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 10
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens,
                                    batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                        batchsize=len(test_vidlens))
    integral_lens = compute_integral_len(train_vidlens)

    # We'll use this "validation set" to periodically check progress
    # (it is the whole held-out subject, drawn once as a single batch).
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(test_vidlens)
    dct_val = gen_seq_batch_from_idx(test_dct, idxs_val, test_vidlens,
                                     integral_lens_val, np.max(test_vidlens))
    X_diff_val = gen_seq_batch_from_idx(test_X_diff, idxs_val, test_vidlens,
                                        integral_lens_val, np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the one before it."""
        if len(cost_window) < 2:
            return False
        curr = cost_window[0]
        for idx, window_cost in enumerate(cost_window):
            if curr < window_cost or idx == 0:
                curr = window_cost
            else:
                return False
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')

        # The epoch's training cost is measured on its last minibatch only.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: how far the current validation cost is above the best so far (%).
        # pk: training progress over the last STRIP_SIZE epochs.
        # NOTE(review): train_strip starts as zeros, so until STRIP_SIZE
        # epochs have run its minimum is 0 and pk divides by zero.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        print(
            "Epoch {} train cost = {}, validation cost = {}, "
            "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
            .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                    time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch >= decay_start - 1:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('classification rate: {}, validation loss: {}'.format(
        best_cr, best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate,
                         savefilename='valid_cost')

    if 'results' in options:
        print('writing to results file: {}...'.format(options['results']))
        # Opened only for the write, so the handle is always closed.
        with open(options['results'], mode='a') as f:
            f.write('{}, {}, {}\n'.format(test_subject_ids[0], best_cr, best_val))
def main():
    """Train the ``adenet_v3`` fusion model on a fixed train/val/test split.

    The config file path comes from ``options['config']``. The validation
    split drives best-model tracking and early stopping; the test split is
    evaluated every time the validation cost improves, and the most recent
    such evaluation is what gets reported. When ``options['write_results']``
    is set, a ``fusiontype,test_cr,best_val`` line is appended to that file.

    Raises:
        ValueError: if ``load_finetune`` or ``load_finetune_diff`` is off,
            because both encoders are needed to build the model.
    """
    configure_theano()
    options = parse_options()
    config_file = options['config']
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('CLI options: {}'.format(options.items()))
    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    fusiontype = config.get('models', 'fusiontype')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    train_vidlens = data['trVideoLengthVec'].astype('int').reshape((-1,))
    val_vidlens = data['valVideoLengthVec'].astype('int').reshape((-1,))
    test_vidlens = data['testVideoLengthVec'].astype('int').reshape((-1,))
    train_X = data['trData'].astype('float32')
    val_X = data['valData'].astype('float32')
    test_X = data['testData'].astype('float32')
    train_dct = dct_data['trDctFeatures'].astype('float32')
    val_dct = dct_data['valDctFeatures'].astype('float32')
    test_dct = dct_data['testDctFeatures'].astype('float32')
    train_X_diff = compute_diff_images(train_X, train_vidlens)
    val_X_diff = compute_diff_images(val_X, val_vidlens)
    test_X_diff = compute_diff_images(test_X, test_vidlens)
    # +1 to handle the -1 introduced in lstm_gendata
    train_y = data['trTargetsVec'].astype('int').reshape((-1,)) + 1
    val_y = data['valTargetsVec'].astype('int').reshape((-1,)) + 1
    test_y = data['testTargetsVec'].astype('int').reshape((-1,)) + 1

    # featurewise normalize dct features; val/test reuse the statistics
    # computed on the training split
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    val_dct = (val_dct - dct_mean) / dct_std
    test_dct = (test_dct - dct_mean) / dct_std

    # NOTE: pickle.load can execute arbitrary code; only point the config at
    # trusted model files.
    ae = None
    ae_diff = None
    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()
    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()
    if ae is None or ae_diff is None:
        raise ValueError(
            'both encoders are required: enable load_finetune and '
            'load_finetune_diff in the [training] section')

    # IMPT: the encoder was trained with fortran ordered images, so to
    # visualize convert all the images to C order using reshape_images_order()

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed in place
    # without recompiling the training function.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, l_fuse = adenet_v3.create_model(ae, ae_diff, (None, None, 1500), inputs,
                                             (None, None), mask,
                                             (None, None, 90), dct,
                                             (None, None, 1500), inputs_diff,
                                             250, window, 10, fusiontype)
    print_network(network)

    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)

    train = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost, allow_input_downcast=True)
    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions, allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 45
    BATCH_SIZE = 20
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_cr = 0.0
    # Bound up front so the final report cannot hit an unbound name if the
    # validation cost never improves (e.g. it is NaN from the first epoch).
    test_cr = None
    test_conf = None
    adascale_param = None

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens,
                                    batchsize=BATCH_SIZE)
    integral_lens = compute_integral_len(train_vidlens)
    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens,
                                        batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                         batchsize=len(test_vidlens))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(val_vidlens)
    dct_val = gen_seq_batch_from_idx(val_dct, idxs_val, val_vidlens,
                                     integral_lens_val, np.max(val_vidlens))
    X_diff_val = gen_seq_batch_from_idx(val_X_diff, idxs_val, val_vidlens,
                                        integral_lens_val, np.max(val_vidlens))

    # we use the test set to check final classification rate
    X_test, y_test, mask_test, idxs_test = next(test_datagen)
    integral_lens_test = compute_integral_len(test_vidlens)
    dct_test = gen_seq_batch_from_idx(test_dct, idxs_test, test_vidlens,
                                      integral_lens_test, np.max(test_vidlens))
    X_diff_test = gen_seq_batch_from_idx(test_X_diff, idxs_test, test_vidlens,
                                         integral_lens_test, np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the one before it."""
        if len(cost_window) < 2:
            return False
        curr = cost_window[0]
        for idx, window_cost in enumerate(cost_window):
            if curr < window_cost or idx == 0:
                curr = window_cost
            else:
                return False
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')

        # The epoch's training cost is measured on its last minibatch only.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: how far the current validation cost is above the best so far (%).
        # pk: training progress over the last STRIP_SIZE epochs.
        # NOTE(review): train_strip starts as zeros, so until STRIP_SIZE
        # epochs have run its minimum is 0 and pk divides by zero.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            best_val = val_cost
            best_cr = cr
            if fusiontype == 'adasum':
                adascale_param = las.layers.get_all_param_values(l_fuse, scaling_param=True)
            # The test split is only scored when validation improves.
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test,
                                                dct_test, X_diff_test,
                                                WINDOW_SIZE, val_fn)
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                          test_cr, time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                          time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch + 1 >= decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr))
    if fusiontype == 'adasum':
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    if test_conf is not None:
        plot_confusion_matrix(test_conf, numbers, fmt='latex')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')

    if options['write_results']:
        results_file = options['write_results']
        with open(results_file, mode='a') as f:
            f.write('{},{},{}\n'.format(fusiontype, test_cr, best_val))
def main():
    """Train a convolutional autoencoder and save the encoder and full model.

    The architecture is chosen by ``options['MODEL']`` (``'normal'``,
    ``'batchnorm'``, ``'dropout'`` or ``'bn+dropout'``). Ctrl-C sets the
    module-level ``terminate`` flag, which stops training after the current
    minibatch; the remaining steps (reconstruction plots, saving) still run.

    NOTE(review): ``terminate`` is read before this function ever assigns it,
    so it presumably is initialised to ``False`` at module level - confirm.

    Raises:
        ValueError: if ``options['MODEL']`` is not one of the supported names.
    """
    def signal_handler(signum, frame):
        # Only record the request; the training loops poll the flag.
        global terminate
        terminate = True
        print('terminating...')

    signal.signal(signal.SIGINT, signal_handler)
    configure_theano()
    options = parse_options()
    X, X_val = generate_data()

    print('X type and shape:', X.dtype, X.shape)
    print('X.min():', X.min())
    print('X.max():', X.max())

    print('X_val type and shape:', X_val.dtype, X_val.shape)
    print('X_val.min():', X_val.min())
    print('X_val.max():', X_val.max())

    # The reconstruction targets are the inputs flattened to one row per
    # sample.
    X_out = X.reshape((X.shape[0], -1))
    X_val_out = X_val.reshape((X_val.shape[0], -1))
    print('X_out:', X_out.dtype, X_out.shape)
    print('X_val_out', X_val_out.dtype, X_val_out.shape)

    print('constructing and compiling model...')
    input_var = T.tensor3('input', dtype='float32')
    target_var = T.matrix('output', dtype='float32')
    # The learning rate is a shared variable so it can be decayed in place
    # without recompiling the training function.
    lr = theano.shared(np.array(0.8, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(0.9, dtype=theano.config.floatX)

    # The 1200-wide rows are reshaped into single-channel 30x40 images
    # before they reach the convolutional layers.
    l_input = InputLayer((None, None, 1200), input_var, name='input')
    l_input = ReshapeLayer(l_input, (-1, 1, 30, 40), name='reshape_input')

    model_name = options['MODEL']
    if model_name == 'normal':
        network, encoder = avletters_convae.create_model(l_input, options)
    elif model_name == 'batchnorm':
        network, encoder = avletters_convae_bn.create_model(l_input, options)
    elif model_name == 'dropout':
        network, encoder = avletters_convae_drop.create_model(l_input, options)
    elif model_name == 'bn+dropout':
        network, encoder = avletters_convae_bndrop.create_model(
            l_input, options)
    else:
        # Previously an unknown name fell through and failed later with a
        # NameError on ``network``.
        raise ValueError('unsupported MODEL option: {!r}'.format(model_name))

    print('AE Network architecture: {}'.format(options['MODEL']))
    print_network(network)

    recon = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(squared_error(recon, target_var))
    updates = adadelta(cost, all_params, lr)

    use_max_constraint = False
    print('apply max norm constraint: {}'.format(use_max_constraint))
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(param, MAX_NORM)

    train = theano.function([input_var, target_var], recon,
                            updates=updates, allow_input_downcast=True)
    train_cost_fn = theano.function([input_var, target_var], cost,
                                    allow_input_downcast=True)

    eval_recon = las.layers.get_output(network, deterministic=True)
    eval_cost = T.mean(las.objectives.squared_error(eval_recon, target_var))
    eval_cost_fn = theano.function([input_var, target_var], eval_cost,
                                   allow_input_downcast=True)
    recon_fn = theano.function([input_var], eval_recon,
                               allow_input_downcast=True)

    # Honour a Ctrl-C that arrived while the model was compiling.
    if terminate:
        sys.exit()

    NUM_EPOCHS = options['NUM_EPOCHS']
    EPOCH_SIZE = options['EPOCH_SIZE']
    NO_STRIDES = options['NO_STRIDES']
    VAL_NO_STRIDES = options['VAL_NO_STRIDES']

    print('begin training for {} epochs...'.format(NUM_EPOCHS))
    datagen = batch_iterator(X, X_out, 128)

    costs = []
    val_costs = []
    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            batch_X, batch_y = next(datagen)
            # float(...) matches how the other training scripts format the
            # shared learning rate.
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(batch_X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            batch_X = batch_X.reshape((-1, 1, 1200))
            train(batch_X, batch_y)
            print('\r', end='')
            if terminate:
                break
        if terminate:
            break

        cost = batch_compute_cost(X, X_out, NO_STRIDES, train_cost_fn)
        val_cost = batch_compute_cost(X_val, X_val_out, VAL_NO_STRIDES, eval_cost_fn)
        costs.append(cost)
        val_costs.append(val_cost)

        print("Epoch {} train cost = {}, validation cost = {} ({:.1f}sec) ".
              format(epoch + 1, cost, val_cost, time.time() - time_start))
        # Decay starts from the twelfth epoch (epoch index 11) onwards.
        if epoch > 10:
            lr.set_value(lr.get_value() * lr_decay)

    X_val_recon = recon_fn(X_val)
    visualize_reconstruction(X_val_out[450:550], X_val_recon[450:550],
                             shape=(30, 40), savefilename='avletters')
    plot_validation_cost(costs, val_costs, None, savefilename='valid_cost')

    conv2d1 = las.layers.get_all_layers(network)[2]
    visualize.plot_conv_weights(conv2d1, (15, 14)).savefig('conv2d1.png')

    print('saving encoder...')
    save_model(encoder, 'models/conv_encoder.dat')
    save_model(network, 'models/conv_ae.dat')
def build_network_2dconv(
    args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36
):
    """Build a two-branch 2D-convolution sentence-pair classifier.

    Each sentence goes through a frozen embedding lookup, one convolution
    spanning three words by the full embedding width, and max-pooling over
    all positions. The two pooled vectors are concatenated, passed through a
    sigmoid hidden layer of ``args.hiddenDim`` units and a softmax output
    layer (5 classes for task ``"sts"``, 3 for ``"ent"``).

    Args:
        args: options object; reads ``hiddenDim``, ``task``, ``optimizer``
            and ``step`` (used as the learning rate).
        input1_var, input2_var: symbolic inputs for the two sentences, fed to
            input layers of shape ``(None, maxlen)``.
        input1_mask_var, intut2_mask_var: accepted for interface
            compatibility; not used by this model.
        target_var: symbolic targets passed to ``categorical_crossentropy``.
        wordEmbeddings: embedding matrix; ``shape[0]`` is taken as the word
            dimension and ``shape[1]`` as the vocabulary size.
        maxlen: sentence length the network is built for.

    Returns:
        ``(train_fn, val_fn)``. ``train_fn(input1, input2, target)`` applies
        one update and returns the regularized loss. ``val_fn`` returns
        ``[loss, predictions]`` for ``"sts"`` and ``[loss, accuracy]`` for
        ``"ent"``, both computed deterministically and without the L2 penalty.

    Raises:
        ValueError: if ``args.task`` or ``args.optimizer`` is not supported.
    """
    print("Building model with 2D Convolution")

    # Fail before building anything; an unknown task used to surface later
    # as a NameError on ``network``.
    if args.task == "sts":
        num_classes = 5
    elif args.task == "ent":
        num_classes = 3
    else:
        raise ValueError("Unsupported task: {!r}".format(args.task))

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: each filter covers 3 consecutive words across the
    # whole embedding, and pooling collapses every resulting position.
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # ---- first sentence ----
    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize = input_1.input_var.shape[0]

    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # Keep the supplied embeddings fixed during training.
    emb_1.params[emb_1.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim))
    conv2d_1 = Conv2DLayer(
        reshape_1,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, num_filters, maxlen - 2, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, num_filters, 1, 1)
    forward_1 = FlattenLayer(maxpool_1)  # (None, num_filters)

    # ---- second sentence (same architecture, separate weights) ----
    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")

    reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim))
    conv2d_2 = Conv2DLayer(
        reshape_2,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, num_filters, maxlen - 2, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)  # (None, num_filters, 1, 1)
    forward_2 = FlattenLayer(maxpool_2)  # (None, num_filters)

    # NOTE(review): an element-wise product / absolute-difference merge of
    # the two branches used to be built here and was immediately overwritten
    # by the plain concatenation below, so it never reached the network.
    # The dead layers were removed; the effective model is unchanged.
    concat = ConcatLayer([forward_1, forward_2])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=num_classes, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(categorical_crossentropy(prediction, target_var))

    # L2 penalty. NOTE(review): only the first branch's convolution is
    # regularized (conv2d_2 is not listed) - confirm this is intended.
    lambda_val = 0.5 * 1e-4
    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input1_var, input2_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True
        )
    else:
        # args.task == "ent" (validated at the top of the function)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function([input1_var, input2_var, target_var],
                                 [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
def main():
    """Train the tri-modal ``adenet_v5`` model on a file-defined subject split.

    Settings are read from ``config/trimodal.ini``; the subject ids for the
    two splits come from ``data/train_val.txt`` and ``data/test.txt``. The
    test split is used as the validation set for early stopping and for the
    reported metrics. With ``use_adascale`` enabled, the scaling parameters
    captured at the best validation cost are printed at the end.

    Raises:
        ValueError: if the configuration does not yield both encoders
            (image encoder via ``do_finetune``/``load_finetune`` and diff
            encoder via ``load_finetune_diff``).
    """
    configure_theano()
    config_file = 'config/trimodal.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    use_adascale = config.getboolean('models', 'use_adascale')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']
    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y),))
    dct_feats = dct_data['dctFeatures'].astype('float32')
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects),))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens),))

    X_diff = compute_diff_images(X, video_lens)

    # mean remove dct features
    dct_feats = sequencewise_mean_image_subtraction(dct_feats, video_lens)

    train_subject_ids = read_data_split_file('data/train_val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print(train_subject_ids)
    print(test_subject_ids)
    train_X, train_y, train_dct, train_X_diff, train_vidlens, train_subjects, \
        test_X, test_y, test_dct, test_X_diff, test_vidlens, test_subjects = \
        split_data(X, y, dct_feats, X_diff, subjects, video_lens,
                   train_subject_ids, test_subject_ids)

    # The split must partition the data: nothing lost, nothing duplicated.
    assert train_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    # featurewise normalize dct features; the test split reuses the
    # statistics computed on the training split
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    test_dct = (test_dct - dct_mean) / dct_std

    ae = None
    ae_diff = None
    if do_finetune:
        print('performing finetuning on pretrained encoder: {}'.format(ae_pretrained))
        ae = load_dbn(ae_pretrained)
        ae.initialize()
        ae.fit(train_X, train_X)
        if save_finetune:
            print('saving finetuned encoder: {}...'.format(ae_finetuned))
            with open(ae_finetuned, 'wb') as model_file:
                pickle.dump(ae, model_file)

    # NOTE: pickle.load can execute arbitrary code; only point the config at
    # trusted model files.
    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()

    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()

    if ae is None or ae_diff is None:
        raise ValueError(
            'both encoders are required: enable do_finetune or load_finetune '
            'for the image encoder and load_finetune_diff for the diff encoder')

    # IMPT: the encoder was trained with fortran ordered images, so to
    # visualize convert all the images to C order using reshape_images_order()

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed in place
    # without recompiling the training function.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, adascale = adenet_v5.create_model(ae, ae_diff, (None, None, 1144), inputs,
                                               (None, None), mask,
                                               (None, None, 90), dct,
                                               (None, None, 1144), inputs_diff,
                                               250, window, 10, use_adascale)
    print_network(network)

    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(
                    param,
                    MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost, allow_input_downcast=True)
    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions, allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    # Bound up front so the final report cannot hit an unbound name if the
    # validation cost never improves (e.g. it is NaN from the first epoch).
    adascale_param = None

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens,
                                    batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                        batchsize=len(test_vidlens))
    integral_lens = compute_integral_len(train_vidlens)

    # We'll use this "validation set" to periodically check progress
    # (it is the whole test split, drawn once as a single batch).
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(test_vidlens)
    dct_val = gen_seq_batch_from_idx(test_dct, idxs_val, test_vidlens,
                                     integral_lens_val, np.max(test_vidlens))
    X_diff_val = gen_seq_batch_from_idx(test_X_diff, idxs_val, test_vidlens,
                                        integral_lens_val, np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the one before it."""
        if len(cost_window) < 2:
            return False
        curr = cost_window[0]
        for idx, window_cost in enumerate(cost_window):
            if curr < window_cost or idx == 0:
                curr = window_cost
            else:
                return False
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')

        # The epoch's training cost is measured on its last minibatch only.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: how far the current validation cost is above the best so far (%).
        # pk: training progress over the last STRIP_SIZE epochs.
        # NOTE(review): train_strip starts as zeros, so until STRIP_SIZE
        # epochs have run its minimum is 0 and pk divides by zero.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        print("Epoch {} train cost = {}, validation cost = {}, "
              "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                      time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr
            if use_adascale:
                adascale_param = las.layers.get_all_param_values(adascale, scaling_param=True)

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch >= decay_start - 1:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('classification rate: {}, validation loss: {}'.format(best_cr, best_val))
    if use_adascale:
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate, savefilename='valid_cost')
def construct_lstm(input_size, lstm_size, output_size, train_data_gen, val_data_gen):
    """Build a masked single-layer LSTM classifier, train it and report results.

    The network is input -> LSTM -> last time step -> softmax.  It is trained
    with adadelta on the mean categorical cross-entropy for at most
    NUM_EPOCHS epochs of EPOCH_SIZE minibatches, stopping early once the
    validation cost has risen strictly across the whole validation window.
    Per-epoch statistics are printed; at the end the confusion matrix of the
    best epoch and the cost curves are plotted.

    Args:
        input_size: number of features per time step.
        lstm_size: number of hidden/cell units of the LSTM layer.
        output_size: number of softmax output units.
        train_data_gen: iterator yielding 4-field training batches whose
            first three fields are (X, y, mask).
        val_data_gen: iterator whose first item is the validation batch; its
            first three fields are (X, y, mask).

    Returns:
        None.
    """
    # Input, forget and output gates share one initialisation scheme:
    # orthogonal input/hidden weights and zero bias.  The nonlinearity is left
    # at the Gate default (the original comment states this is the sigmoid).
    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    # Leaving the first two dimensions as None allows arbitrary minibatch
    # sizes and sequence lengths.
    l_in = InputLayer(shape=(None, None, input_size), name='input')
    # Masks are matrices of shape (n_batch, n_time_steps); both vary.
    l_mask = InputLayer(shape=(None, None), name='mask')

    l_lstm = LSTMLayer(
        l_in, lstm_size,
        mask_input=l_mask,
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # Learn the initial state and clip gradients.
        learn_init=True, grad_clipping=5., name='lstm1')

    # Only the final time step is classified: slice index -1 along axis 1.
    l_forward_slice = SliceLayer(l_lstm, -1, 1, name='slice')
    l_out = DenseLayer(
        l_forward_slice, num_units=output_size,
        nonlinearity=las.nonlinearities.softmax, name='output')
    print_network(l_out)

    # One integer target per sequence.
    target_values = T.ivector('target_output')

    prediction = las.layers.get_output(l_out)
    cost = T.mean(las.objectives.categorical_crossentropy(prediction, target_values))
    all_params = las.layers.get_all_params(l_out, trainable=True)
    updates = adadelta(cost, all_params)

    fn_inputs = [l_in.input_var, target_values, l_mask.input_var]
    train = theano.function(
        fn_inputs, cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        fn_inputs, cost, allow_input_downcast=True)

    test_prediction = las.layers.get_output(l_out, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_prediction, target_values))
    compute_val_cost = theano.function(
        fn_inputs, test_cost, allow_input_downcast=True)
    val_fn = theano.function(
        [l_in.input_var, l_mask.input_var], test_prediction,
        allow_input_downcast=True)

    # Fixed "validation set" used to check progress after every epoch.
    # Training batches below carry four fields; slicing lets the validation
    # generator yield either three or four.
    X_val, y_val, mask_val = next(val_data_gen)[:3]

    cost_train = []
    cost_val = []
    class_rate = []
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    NUM_EPOCHS = 30
    EPOCH_SIZE = 26
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))

    def early_stop(cost_window):
        """Return True when the windowed costs are strictly increasing."""
        if len(cost_window) < 2:
            return False
        previous = None
        for idx, current in enumerate(cost_window):
            if idx > 0 and not previous < current:
                return False
            previous = current
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for _ in range(EPOCH_SIZE):
            X, y, m, _unused = next(train_data_gen)
            train(X, y, m)
        # The training cost is measured on the last minibatch of the epoch.
        train_cost = compute_train_cost(X, y, m)
        val_cost = compute_val_cost(X_val, y_val, mask_val)
        cr, conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        cost_train.append(train_cost)
        cost_val.append(val_cost)
        class_rate.append(cr)

        train_strip[epoch % STRIP_SIZE] = train_cost
        val_window.push(val_cost)

        # Generalization loss: relative increase over the best validation cost.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        # Training progress over the strip.  Only slots already written are
        # used: the strip starts as zeros, which would otherwise force a
        # division by zero during the first STRIP_SIZE - 1 epochs.
        filled = train_strip[:min(epoch + 1, STRIP_SIZE)]
        strip_min = np.min(filled)
        if strip_min > 0:
            pk = 1000 * (np.sum(filled) / (len(filled) * strip_min) - 1)
        else:
            pk = 0.0
        if pk > 0:
            pq = gl / pk
        else:
            pq = float('inf') if gl > 0 else 0.0

        print("Epoch {} train cost = {}, validation cost = {}, "
              "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_cr = cr
            best_conf = conf

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

    letters = list('abcdefghijklmnopqrstuvwxyz')

    print('Final Model')
    print('classification rate: {}'.format(best_cr))
    print('validation loss: {}'.format(best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, letters, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
def main():
    """Train and evaluate the baseline LSTM classifier on encoded image features.

    Steps, all driven by 'config/separate_train.ini':
      1. load the image matrix, targets, subject ids and video lengths;
      2. split them by subject into train / validation / test sets;
      3. optionally finetune (and save) or load the autoencoder, then encode
         every split with its encoder half and normalize the features with
         the training-set statistics;
      4. train the LSTM classifier with adadelta, learning-rate decay and
         early stopping, evaluating the test set whenever validation improves;
      5. print the final rates and plot the confusion matrix and cost curves.

    Raises:
        ValueError: if the configuration enables neither 'do_finetune' nor
            'load_finetune', so that no encoder is available.
    """
    configure_theano()
    config_file = 'config/separate_train.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    lstm_units = int(config.get('training', 'lstm_units'))
    output_units = int(config.get('training', 'output_units'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    #  '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']
    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y), ))
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects), ))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens), ))

    train_subject_ids = read_data_split_file('data/train.txt')
    val_subject_ids = read_data_split_file('data/val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print('Train: {}'.format(train_subject_ids))
    print('Validation: {}'.format(val_subject_ids))
    print('Test: {}'.format(test_subject_ids))
    train_X, train_y, train_vidlens, train_subjects, \
        val_X, val_y, val_vidlens, val_subjects, \
        test_X, test_y, test_vidlens, test_subjects = \
        split_data(X, y, subjects, video_lens,
                   train_subject_ids, val_subject_ids, test_subject_ids)

    # Sanity check: the three splits must partition the data exactly.
    assert train_X.shape[0] + val_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + val_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + val_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + val_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    # Every split goes through the same encoder below, so every split gets the
    # same input normalization (the validation split used to be left out).
    train_X = normalize_input(train_X, centralize=True)
    val_X = normalize_input(val_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    dbn = None
    if do_finetune:
        dbn = load_dbn(ae_pretrained)
        dbn.initialize()
        dbn.fit(train_X, train_X)
        recon = dbn.predict(test_X)
        visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
                                 reorder_data(recon[800:864], (26, 44)),
                                 shape=(26, 44))
        # Saving is only meaningful for a model finetuned in this run.
        if save_finetune:
            with open(ae_finetuned, 'wb') as model_file:
                pickle.dump(dbn, model_file)

    if load_finetune:
        print('loading pre-trained encoding layers...')
        # NOTE: pickle executes code on load; only point the config at model
        # files from a trusted source.
        with open(ae_finetuned, 'rb') as model_file:
            dbn = pickle.load(model_file)
        dbn.initialize()

    if dbn is None:
        raise ValueError("no encoder available: enable 'do_finetune' or "
                         "'load_finetune' in {}".format(config_file))

    encoder = extract_encoder(dbn)
    train_X = encoder.predict(train_X)
    val_X = encoder.predict(val_X)
    test_X = encoder.predict(test_X)

    # Featurewise normalization using the training-set mean and std.
    train_X, mean, std = featurewise_normalize_sequence(train_X)
    val_X = (val_X - mean) / std
    test_X = (test_X - mean) / std

    # IMPT: the encoder was trained with fortran ordered images, so to
    # visualize, convert all the images to C order using reshape_images_order().

    inputs = T.tensor3('inputs', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing lstm classifier...')
    network = lstm_classifier_baseline.create_model(
        (None, None, 50), inputs, (None, None), mask, lstm_units, output_units)

    print_network(network)
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)

    # Optional max-norm constraint on the weight matrices (disabled).
    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(
                    param, MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function([inputs, targets, mask], cost,
                            updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask], cost,
                                         allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function([inputs, targets, mask], test_cost,
                                        allow_input_downcast=True)
    val_fn = theano.function([inputs, mask], test_predictions,
                             allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 10
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))
    best_val = float('inf')
    best_cr = 0.0
    # Bound up front so the final report cannot hit an unbound name when the
    # validation cost never improves (e.g. it is NaN from the first epoch).
    test_cr = 0.0
    test_conf = None

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens, batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens,
                                        batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                         batchsize=len(test_vidlens))

    # One batch as large as the whole split serves as the fixed validation
    # (and test) set.
    X_val, y_val, mask_val, _ = next(val_datagen)
    X_test, y_test, mask_test, _ = next(test_datagen)

    def early_stop(cost_window):
        """Return True when the windowed costs are strictly increasing."""
        if len(cost_window) < 2:
            return False
        previous = None
        for idx, current in enumerate(cost_window):
            if idx > 0 and not previous < current:
                return False
            previous = current
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X_batch, y_batch, m_batch, _ = next(datagen)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X_batch), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X_batch, y_batch, m_batch)
            print('\r', end='')
        # The training cost is measured on the last minibatch of the epoch.
        cost = compute_train_cost(X_batch, y_batch, m_batch)
        val_cost = compute_test_cost(X_val, y_val, mask_val)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # Generalization loss: relative increase over the best validation cost.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        # Training progress over the strip.  Only slots already written are
        # used: the strip starts as zeros, which would otherwise force a
        # division by zero during the first STRIP_SIZE - 1 epochs.
        filled = train_strip[:min(epoch + 1, STRIP_SIZE)]
        strip_min = np.min(filled)
        if strip_min > 0:
            pk = 1000 * (np.sum(filled) / (len(filled) * strip_min) - 1)
        else:
            pk = 0.0
        if pk > 0:
            pq = gl / pk
        else:
            pq = float('inf') if gl > 0 else 0.0

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            best_val = val_cost
            best_cr = cr
            # The test set is only evaluated for the best-so-far model.
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test, val_fn)
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, test_cr,
                          time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                          time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        # NOTE(review): with a 0-based epoch the first decay is applied after
        # epoch index decay_start + 1 -- confirm this offset is intended.
        if epoch > decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr))
    print('confusion matrix: ')
    if test_conf is not None:
        plot_confusion_matrix(test_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')
def model_class(ds, paths, param_arch, param_cost, param_updates, param_train):
    """Build the classification network and compile its Theano functions.

    A configuration log is opened in paths['exp'], the architecture selected
    by param_arch['type'] is instantiated, the cost (data term plus L2
    regularization) and the parameter updates are defined, and four
    functions are compiled.

    Args:
        ds: dataset object; ds.descs_train, ds.labels_train and
            ds.epoch_size are read.
        paths: dict; paths['exp'] is the directory receiving the log file.
        param_arch: dict with 'type' (0, 1 or 2) and 'non_linearity'; it is
            also passed on to the architecture builder.
        param_cost: dict with 'cost_func' and the L2 weight 'mu'.
        param_updates: dict with 'method' ('momentum', 'adagrad',
            'adadelta' or 'adam') and that method's hyper-parameters.
        param_train: dict; an optional 'start_from_epoch' selects the log
            file name.

    Returns:
        Tuple (funcs, cla, updates): funcs maps 'train', 'fwd', 'pred' and
        'feat' to compiled functions, cla is the classification output layer
        and updates the update dictionary used by funcs['train'].

    Raises:
        Exception: if the log file already exists, or if the architecture,
            cost function or update method is unsupported, or a required
            update hyper-parameter is missing.
    """
    # create a log file containing the architecture configuration
    formatter = logging.Formatter('%(message)s')
    logger = logging.getLogger('log_config')
    if 'start_from_epoch' in param_train:
        name_tmp = 'config_from_epoch=%04d.log' % (param_train['start_from_epoch'])
    else:
        name_tmp = 'config.log'
    path_tmp = os.path.join(paths['exp'], name_tmp)
    if os.path.isfile(path_tmp):
        # Never overwrite the configuration of an earlier run.
        raise Exception('[e] the log file %s already exists!' % name_tmp)
    handler = logging.FileHandler(path_tmp, mode='w')  # to append at the end of the file use: mode='a'
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # input dimensions
    dim_desc = ds.descs_train[0].shape[1]
    dim_labels = ds.labels_train[0].shape[0]
    print(dim_labels)

    # architecture definition:
    print(("[i] architecture definition... "), end=' ')
    tic = time.time()
    if param_arch['type'] == 0:
        desc, patch_op, cla, net, logger = arch_class_00(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 1:
        desc, patch_op, cla, net, logger = arch_class_01(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 2:
        desc, patch_op, cla, net, logger = arch_class_02(
            dim_desc, dim_labels, param_arch, logger)
    else:
        raise Exception('[e] architecture not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # cost function definition:
    print(("[i] cost function definition... "), end=' ')
    tic = time.time()
    # NOTE(review): the training cost is built from the deterministic output,
    # so stochastic layers such as dropout are disabled in funcs['train'] as
    # well -- confirm this is intended.
    pred = LL.get_output(cla, deterministic=True)  # in case we use dropout
    feat = LL.get_output(net)
    target = T.ivector('target')
    # data term
    unsupported_pair = ('[e] the chosen cost function is not implemented '
                        'for the chosen non-linearity!')
    if param_cost['cost_func'] == 'cross_entropy':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(LO.categorical_crossentropy(pred, target))
        elif param_arch['non_linearity'] == 'log_softmax':
            cost_dataterm = T.mean(categorical_crossentropy_logdomain(pred, target))
        else:
            # Without this branch cost_dataterm stayed unbound and the
            # function failed later with an unrelated NameError.
            raise Exception(unsupported_pair)
    elif param_cost['cost_func'] == 'cross_entropy_stable':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(categorical_crossentropy_stable(pred, target))
        else:
            raise Exception(unsupported_pair)
    else:
        raise Exception('[e] the chosen cost function is not supported!')
    # classification accuracy
    acc = LO.categorical_accuracy(pred, target).mean()
    # regularization
    cost_reg = param_cost['mu'] * LR.regularize_network_params(cla, LR.l2)
    # cost function
    cost = cost_dataterm + cost_reg
    # get params
    params = LL.get_all_params(cla)
    # gradient definition
    grad = T.grad(cost, params)
    grad_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grad]), 2)
    print(("%02.2fs" % (time.time() - tic)))

    def _required(key):
        """Return param_updates[key]; raise if it is absent or None."""
        value = param_updates.get(key)
        if value is None:
            raise Exception('[e] missing %s parameter!' % key)
        return value

    # updates definition:
    print(("[i] gradient updates definition... "), end=' ')
    tic = time.time()
    method = param_updates['method']
    if method == 'momentum':
        learning_rate = _required('learning_rate')  # default: 1.0
        momentum = _required('momentum')  # default: 0.9
        updates = LU.momentum(grad, params, learning_rate, momentum)
    elif method == 'adagrad':
        learning_rate = _required('learning_rate')  # default: 1.0
        updates = LU.adagrad(grad, params, learning_rate)
    elif method == 'adadelta':
        learning_rate = _required('learning_rate')  # default: 1.0
        updates = LU.adadelta(grad, params, learning_rate)
    elif method == 'adam':
        learning_rate = _required('learning_rate')  # default: 1e-03
        beta1 = _required('beta1')  # default: 0.9
        beta2 = _required('beta2')  # default: 0.999
        epsilon = _required('epsilon')  # default: 1e-08
        updates = LU.adam(grad, params, learning_rate, beta1, beta2, epsilon)
    else:
        raise Exception('[e] updates method not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # train / test functions: all four take the same three inputs so callers
    # can feed them identically; unused ones are tolerated.
    funcs = dict()
    print(("[i] compiling function 'train'... "), end=' ')
    tic = time.time()
    funcs['train'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [cost, cost_dataterm, cost_reg, grad_norm, acc],
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'fwd'... "), end=' ')
    tic = time.time()
    funcs['fwd'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [cost, grad_norm, acc],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'pred'... "), end=' ')
    tic = time.time()
    funcs['pred'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'feat'... "), end=' ')
    tic = time.time()
    funcs['feat'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [feat],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))

    # save cost function parameters to a config file
    logger.info('\nCost function parameters:')
    logger.info(' cost function = %s' % param_cost['cost_func'])
    logger.info(' mu = %e' % param_cost['mu'])

    # save updates parameters to a config file
    logger.info('\nUpdates parameters:')
    logger.info(' method = %s' % param_updates['method'])
    logger.info(' learning rate = %e' % param_updates['learning_rate'])
    if param_updates['method'] == 'momentum':
        logger.info(' momentum = %e' % param_updates['momentum'])
    if param_updates['method'] == 'adam':
        logger.info(' beta1 = %e' % param_updates['beta1'])
        logger.info(' beta2 = %e' % param_updates['beta2'])
        logger.info(' epsilon = %e' % param_updates['epsilon'])

    # save training parameters to a config file
    logger.info('\nTraining parameters:')
    logger.info(' epoch size = %d' % ds.epoch_size)

    return funcs, cla, updates
def __init__(self, retina_model, seeder_model, n_seeds, n_steps, n_units=100, normalization_coefs=None, loss_coefs=None, alpha=1.0, threshold=1.0): self.seeder_model = seeder_model self.n_seeds = n_seeds self.n_steps = n_steps self.threshold = threshold self.retina = retina_model event_shareds = retina_model.get_event_variables() self.seeder = self.seeder_model(retina_model) if normalization_coefs is None: normalization_coefs = np.ones(shape=retina_model.model_nparams, dtype='float32') else: normalization_coefs = np.array(normalization_coefs, dtype='float32') ### params + sigma self.inputs = retina_model.alloc_model_params() self.input_layer, self.out_layer, self.reg = self.build_nn( retina_model.model_nparams, n_units=n_units) print 'Linking to Retina Model' iterations = [self.inputs] responses = [] for i in xrange(self.n_steps): print 'Iteration %d' % i prev = iterations[i] r, grads = retina_model.grad_for(*event_shareds + prev) normed_params = [p * c for p, c in zip(prev, normalization_coefs)] normed_grads = [g * c for g, c in zip(grads, normalization_coefs)] out = self.get_update_for(normed_params, r, normed_grads) param_updates = [out[:, i] for i in range(len(self.inputs))] track_param_updates, sigma_update = param_updates[: -1], param_updates[ -1] ### sigma (last parameter) is updated simply by replacing ### previous variable update = [ var + upd * alpha for var, upd in zip(prev[:-1], track_param_updates) ] + [T.exp(-sigma_update)] for var, upd, new in zip(prev[:-1], track_param_updates, update): print ' -', new, '=', var, '+ %.2e' % alpha, upd iterations.append(update) responses.append(r) prediction = iterations[-1] sigma_train = T.fscalar('sigma_train') ### Except sigma self.true_parameters_shareds = [ theano.shared(np.ndarray(shape=(0, ), dtype='float32'), name=name) for name in retina_model.model_params_names[:-1] ] ### predictions without sigma print 'Constucting loss:' print ' - Loss coefs:', loss_coefs print ' - True params shared:', 
self.true_parameters_shareds print ' - Predictions:', prediction[:-1] print ' - Sigma:', sigma_train pure_response, rmse = retina_model.parameter_response( loss_coefs, *self.true_parameters_shareds + prediction[:-1] + [sigma_train]) pure_loss = 1.0 - pure_response initial_response, initial_rmse = retina_model.parameter_response( loss_coefs, *self.true_parameters_shareds + self.inputs[:-1] + [sigma_train]) initial_loss = 1.0 - initial_response reg_c = T.fscalar('reg_c') alpha_rmse = T.fscalar('reg_c') loss = (1.0 - alpha_rmse) * pure_loss + alpha_rmse * rmse + reg_c * self.reg params = layers.get_all_params(self.out_layer) learning_rate = T.fscalar('learning rate') net_updates = updates.adadelta(loss, params, learning_rate=learning_rate) self._train = theano.function( self.inputs + [sigma_train, learning_rate, reg_c, alpha_rmse], [pure_loss, rmse, self.reg, loss, initial_loss, initial_rmse], updates=net_updates) self._loss = theano.function(self.inputs + [sigma_train], pure_loss) outputs = [v for it in iterations for v in it] self.ndim = len(self.inputs) self.predictions = theano.function(self.inputs, responses + outputs) self.responses = None self.traces = None self.seeds = None
# Layer-3 weights: alphas_L3 is broadcast against basis_L3 and the product is
# summed over axis 2, the axis the two operands share.
# NOTE(review): presumably a learned linear combination of fixed basis
# filters -- confirm against where basis_L3 is defined.
w_L3 = T.sum(alphas_L3[:, :, :, None, None] * basis_L3[None, None, :, :, :], axis=2)
w_L4 = init_weights((3136, 10))

#-------------------------
# Set up function
#-------------------------
# The model graph is built twice: once with the two trailing arguments set to
# 0.2 and 0.7 (used for the training cost) and once with both set to 0.
# (used for prediction).
# NOTE(review): these look like dropout probabilities -- confirm in model().
noise_l1, noise_l2, noise_l3, noise_py_x = model(X, w_L1, w_L2, w_L3, w_L4, 0.2, 0.7)
l1, l2, l3, py_x = model(X, w_L1, w_L2, w_L3, w_L4, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
# Only the alphas and the last weight matrix are optimized; w_L1..w_L3
# themselves are not in the parameter list.
params = [alphas_L1, alphas_L2, alphas_L3, w_L4]
updates = adadelta(cost, params, learning_rate=lr, rho=0.95, epsilon=1e-6)

# The learning rate is a function input, so it can change between calls.
train = theano.function(inputs=[X, Y, lr], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

#-------------------------
# Train model
#-------------------------
d = {}
batch_size = 25
# args.epochs is indexable: its first entry is kept as `epochs`, and the whole
# sequence is kept as an integer array.
epochs = args.epochs[0]
epoch_count = np.array(args.epochs).astype(int)
def build_treatment_model(self, n_vars, **kwargs):
    """Assemble the treatment network and compile its Theano functions.

    The network is input -> dropout -> a dense block of width
    2 * dense_size (batch norm, dropout) -> (n_dense_layers - 1) dense
    blocks of width dense_size (batch norm) -> one linear output unit.
    The loss is gmm_loss plus 1e-4 times the L2 norm of the network
    parameters, optimized with adadelta.

    Sets self.treatment_output, self._train_fn, self._loss_fn and
    self._output_fn.

    Args:
        n_vars: number of input features.
        **kwargs: must contain 'dense_size' and 'n_dense_layers'.

    Returns:
        The initial parameter values of the network.
    """
    features_sym = TT.matrix()
    instruments_sym = TT.matrix()
    targets_sym = TT.vector()
    relu = nonlinearities.rectify

    net = layers.DropoutLayer(
        layers.InputLayer((None, n_vars), features_sym), p=0.2)

    # First hidden block: double width, followed by dropout.
    net = layers.batch_norm(
        layers.DenseLayer(net, 2 * kwargs['dense_size'], nonlinearity=relu))
    net = layers.DropoutLayer(net, p=0.2)

    # Remaining hidden blocks: single width, no dropout.
    for _ in xrange(kwargs['n_dense_layers'] - 1):
        net = layers.batch_norm(
            layers.DenseLayer(net, kwargs['dense_size'], nonlinearity=relu))

    output_layer = layers.DenseLayer(net, 1, nonlinearity=nonlinearities.linear)
    self.treatment_output = output_layer

    # Snapshot of the freshly initialized parameters, returned to the caller.
    initial_values = layers.get_all_param_values(output_layer)

    stochastic_out = layers.get_output(output_layer, deterministic=False)
    deterministic_out = layers.get_output(output_layer, deterministic=True)

    weight_penalty = regularization.regularize_network_params(
        output_layer, regularization.l2)
    loss = gmm_loss(stochastic_out, targets_sym, instruments_sym) + 1e-4 * weight_penalty

    trainable = layers.get_all_params(output_layer, trainable=True)
    step = updates.adadelta(loss, trainable)

    loss_inputs = (features_sym, targets_sym, instruments_sym)
    self._train_fn = theano.function(list(loss_inputs), loss, updates=step)
    self._loss_fn = theano.function(list(loss_inputs), loss)
    self._output_fn = theano.function([features_sym], deterministic_out)
    return initial_values
def __init__(self, full_length, output_size, meta_size, depth=2, encoder_size=64, decoder_size=64):
    """Build the recurrent encoder / dense decoder and compile its Theano functions.

    Two 3-D inputs (the series and its metadata, the latter passed through
    dropout) are concatenated on the last axis and fed through two recurrent
    layers. The result is flattened to 2-D, projected to a 16-unit
    batch-normalised code, and decoded by ``depth`` dense + batch-norm
    blocks. A final linear layer emits ``2 * full_length`` values that are
    split into a mean half and a log-variance half.

    Args:
        full_length: Width of each output half; both halves are reshaped to
            ``(-1, full_length, full_length)``.
        output_size: Size of the last axis of the series input.
        meta_size: Size of the last axis of the metadata input.
        depth: Number of dense + batch-norm blocks in the decoder.
        encoder_size: The recurrent layers use ``encoder_size // 2`` units.
        decoder_size: Number of units of every decoder dense layer.

    Side effects:
        Sets ``self._train_fn`` (one adadelta step, returns the mean training
        loss), ``self._loss_fn`` (mean deterministic loss, no parameter
        updates) and ``self._predict_fn`` (deterministic mean and
        log-variance outputs).
    """
    latent_size = 16
    # Floor division keeps the unit count an integer on Python 3 as well;
    # for integer encoder_size it equals the Python 2 result of `/`.
    recurrent_units = encoder_size // 2

    input_var = TT.tensor3(dtype='float32')
    meta_var = TT.tensor3(dtype='float32')
    target_var = TT.matrix()
    cut_weights = TT.vector(dtype='float32')

    input_layer = layers.InputLayer((None, None, output_size), input_var=input_var)
    meta_layer = layers.InputLayer((None, None, meta_size), input_var=meta_var)
    meta_layer = layers.DropoutLayer(meta_layer, p=0.2)
    concat_input_layer = layers.ConcatLayer([input_layer, meta_layer], axis=-1)

    # encoder
    lstm_layer = layers.RecurrentLayer(concat_input_layer, recurrent_units, learn_init=True)
    lstm_layer = layers.RecurrentLayer(lstm_layer, recurrent_units, learn_init=True)
    # Collapse the leading axes so the dense layers see a 2-D input.
    lstm_layer = layers.ReshapeLayer(lstm_layer, (-1, recurrent_units))
    encoded = layers.DenseLayer(lstm_layer, latent_size)
    encoded = layers.batch_norm(encoded)

    # decoder
    dense = encoded
    for _ in range(depth):
        dense = layers.DenseLayer(dense, decoder_size)
        dense = layers.batch_norm(dense)

    mu_and_logvar_x_layer = layers.DenseLayer(dense, full_length * 2,
                                              nonlinearity=nonlinearities.linear)

    # First half of the output units is the mean, second half the log-variance.
    mu_x_layer = layers.SliceLayer(mu_and_logvar_x_layer, slice(0, full_length), axis=1)
    mu_x_layer = layers.ReshapeLayer(mu_x_layer, (-1, full_length, full_length))
    logvar_x_layer = layers.SliceLayer(mu_and_logvar_x_layer, slice(full_length, None), axis=1)
    logvar_x_layer = layers.ReshapeLayer(logvar_x_layer, (-1, full_length, full_length))

    l2_norm = regularization.regularize_network_params(mu_and_logvar_x_layer,
                                                       regularization.l2)

    # Request both heads in ONE get_output call per mode so they are built
    # from the same intermediate expressions (in training mode: the same
    # dropout mask) instead of two independently sampled graphs.
    output_layers = [mu_x_layer, logvar_x_layer]
    train_mu, train_logvar = layers.get_output(output_layers, deterministic=False)
    eval_mu, eval_logvar = layers.get_output(output_layers, deterministic=True)

    loss = neg_log_likelihood(
        target_var,
        train_mu,
        train_logvar,
        cut_weights
    ) + 1e-4 * l2_norm

    # Evaluation loss uses the deterministic outputs; previously it was a
    # copy of the training loss and therefore ran with dropout enabled.
    test_loss = neg_log_likelihood(
        target_var,
        eval_mu,
        eval_logvar,
        cut_weights
    ) + 1e-4 * l2_norm

    params = layers.get_all_params(mu_and_logvar_x_layer, trainable=True)
    param_updates = updates.adadelta(loss.mean(), params)

    self._train_fn = theano.function(
        [input_var, meta_var, target_var, cut_weights],
        updates=param_updates,
        outputs=loss.mean()
    )

    self._loss_fn = theano.function(
        [input_var, meta_var, target_var, cut_weights],
        outputs=test_loss.mean()
    )

    self._predict_fn = theano.function(
        [input_var, meta_var],
        outputs=[eval_mu, eval_logvar]
    )