def test_rmsprop():
    a = shared_floatx([3, 4])
    cost = (a ** 2).sum()
    step_rule = RMSProp(learning_rate=0.1, decay_rate=0.5, max_scaling=1e5)
    steps, updates = step_rule.compute_steps(
        OrderedDict([(a, tensor.grad(cost, a))]))
    f = theano.function([], [steps[a]], updates=updates)
    assert_allclose(f()[0], [0.141421356, 0.141421356])
    a.set_value([2, 3])
    assert_allclose(f()[0], [0.09701425, 0.102899151])
    a.set_value([1, 1.5])
    assert_allclose(f()[0], [0.06172134, 0.064699664])
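# The expected values above follow directly from the RMSProp update: the
# running average of squared gradients starts at zero, so each step is
# learning_rate * g / sqrt(mean_sq), with
# mean_sq <- decay_rate * mean_sq + (1 - decay_rate) * g ** 2.
# A minimal NumPy sketch reproducing the three assertions (an illustration
# only; it ignores max_scaling, which merely bounds the 1 / sqrt(mean_sq)
# factor, at 1e5 here):
import numpy as np

def rmsprop_steps(grads, lr=0.1, decay=0.5):
    # Replay the accumulator by hand for a sequence of gradients.
    mean_sq = np.zeros_like(grads[0])
    steps = []
    for g in grads:
        mean_sq = decay * mean_sq + (1 - decay) * g ** 2
        steps.append(lr * g / np.sqrt(mean_sq))
    return steps

# The gradient of (a ** 2).sum() is 2 * a for a = [3, 4], [2, 3], [1, 1.5]:
for step in rmsprop_steps([np.array([6., 8.]), np.array([4., 6.]),
                           np.array([2., 3.])]):
    print(step)
# ~ [0.14142136 0.14142136], [0.09701425 0.10289915], [0.06172134 0.06469966]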
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
        learning_rate = adam.learning_rate
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
        learning_rate = rms_prop.learning_rate
    elif name == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
        learning_rate = sgd_momentum.learning_rate
    elif name == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, sgd])
        learning_rate = sgd.learning_rate
    else:
        raise NotImplementedError
    return step_rule, learning_rate
def prepare_opti(cost, test):
    model = Model(cost)
    algorithm = GradientDescent(
        cost=cost,
        parameters=model.parameters,
        step_rule=RMSProp(),
        on_unused_sources='ignore'
    )

    extensions = [
        FinishAfter(after_n_epochs=nb_epoch),
        FinishIfNoImprovementAfter(notification_name='test_cross_entropy',
                                   epochs=patience),
        TrainingDataMonitoring([algorithm.cost], prefix="train",
                               after_epoch=True),
        DataStreamMonitoring([algorithm.cost], test_stream, prefix="test"),
        Printing(),
        ProgressBar(),
        #Checkpoint(path, after_epoch=True)
    ]

    if resume:
        print "Restoring from previous breakpoint"
        extensions.extend([Load(path)])
    return model, algorithm, extensions
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = adam
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate, decay_rate=0.9)
        step_rule = CompositeRule([StepClipping(1.), rms_prop])
    else:
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        step_rule = sgd_momentum
    return step_rule
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
    return step_rule
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    return step_rule
def learning_algorithm(learning_rate, momentum=0.0, clipping_threshold=100,
                       algorithm='sgd'):
    if algorithm == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    elif algorithm == 'sgd':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    else:
        raise NotImplementedError
    return step_rule
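# The ordering comments above are worth spelling out: CompositeRule chains
# its components, feeding the raw gradients to the first rule and each
# rule's output steps to the next. A minimal sketch of the two orderings,
# assuming the Blocks step-rule API:
from blocks.algorithms import Adam, CompositeRule, StepClipping

clipping = StepClipping(threshold=1.0)
adam = Adam(learning_rate=1e-3)

# Adam runs first, so the finished Adam step gets clipped ('step clipping').
step_clipped = CompositeRule([adam, clipping])

# Clipping runs first, so the raw gradient is clipped before Adam rescales
# it ('gradient clipping').
grad_clipped = CompositeRule([clipping, adam])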
def train(self, data_file, output_data_file, n_epochs=0):
    training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
    test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))
    session = Session(root_url='http://localhost:5006')

    if self.MainLoop is None:
        step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95),
                      StepClipping(1)]
        algorithm = GradientDescent(
            cost=self.Cost,
            parameters=self.ComputationGraph.parameters,
            step_rule=CompositeRule(step_rules),
            on_unused_sources='ignore')

        train_stream = DataStream.default_stream(
            training_data, iteration_scheme=SequentialScheme(
                training_data.num_examples, batch_size=100))
        test_stream = DataStream.default_stream(
            test_data, iteration_scheme=SequentialScheme(
                test_data.num_examples, batch_size=100))

        self.MainLoop = MainLoop(
            model=Model(self.Cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(after_n_epochs=n_epochs),
                Printing(),
                Checkpoint(output_data_file, every_n_epochs=50),
                TrainingDataMonitoring([self.Cost], after_batch=True,
                                       prefix='train'),
                DataStreamMonitoring([self.Cost], after_batch=True,
                                     data_stream=test_stream,
                                     prefix='test'),
                Plot(output_data_file,
                     channels=[['train_cost', 'test_cost']])
            ])

    self.MainLoop.run()
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup the training algorithm.

    Parameters
    ----------
    cost : expression
        Cost expression.
    cg : ComputationGraph
        Computation graph.
    method : string
        Training method: SGD, momentum SGD, AdaGrad, or RMSProp.
    type : string
        Model type; step clipping is added for "RNN" models.

    Returns
    -------
    algorithm : GradientDescent
        Gradient descent algorithm based on the chosen optimization method.
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()
    else:
        raise NotImplementedError

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)
    return algorithm
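# A hypothetical call for a recurrent model, using a cost and computation
# graph assumed to be built elsewhere in the script; the "RNN" type wraps
# RMSProp in StepClipping as the branch above describes:
algorithm = setup_algorithms(cost, cg, "rmsprop", type="RNN")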
if args.continue_from:
    from blocks.serialization import load
    main_loop = load(args.continue_from)
    main_loop.run()
    sys.exit(0)

graphs, extensions, updates = construct_graphs(args, nclasses,
                                               sequence_length)
#graph, extension, update = construct_graphs(args, nclasses, sequence_length)

### optimization algorithm definition
step_rule = CompositeRule([
    StepClipping(1.),
    #Momentum(learning_rate=args.learning_rate, momentum=0.9),
    RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
])
algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                            parameters=graphs["training"].parameters,
                            step_rule=step_rule)
algorithm.add_updates(updates["training"])
model = Model(graphs["training"].outputs[0])
extensions = extensions["training"] + extensions["inference"]

# step monitor (after epoch to limit the log size)
step_channels = []
step_channels.extend([
    algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
    for name, param in model.get_parameter_dict().items()
])
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop".split(),
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)

    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels, prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ], data_stream=None, after_epoch=True))

    validation_interval = 500

    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor"
                           % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(
                    channels,
                    prefix="%s_%s" % (which_set, situation),
                    every_n_batches=validation_interval,
                    data_stream=get_stream(which_set=which_set,
                                           batch_size=args.batch_size,
                                           num_examples=10000,
                                           length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])
    if not args.cluster:
        extensions.append(ProgressBar())
    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm, extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import (GradientDescent, Adam, RMSProp,
                               StepClipping, CompositeRule)

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')

from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.main_loop import MainLoop
from blocks.extensions.saveload import Checkpoint
from blocks.model import Model
def train(self):
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    xmini = self.sharedBatch['xmini']
    xmini.name = 'xmini_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx, self.dim, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    xmini_to_h = Linear(self.input_dimxmini, self.mini_dim,
                        name='xmini_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

    rnnwmini = RNNwMini(dim=self.dim, mini_dim=self.mini_dim,
                        summary_dim=self.summary_dim)

    h_to_o = Linear(self.summary_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    xmini_transform = xmini_to_h.apply(xmini)

    h = rnnwmini.apply(x=x_transform, xmini=xmini_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    #y_hat = Logistic().apply(y_hat)

    cost = SquaredError().apply(y, y_hat)
    cost.name = 'cost'

    rnnwmini.initialize()
    x_to_h.initialize()
    xmini_to_h.initialize()
    h_to_o.initialize()

    self.f = theano.function(inputs=[], outputs=y_hat)

    #print("self.f === ")
    #print(self.f())
    #print(self.f().shape)
    #print("====")

    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost],
        data_stream=self.stream_valid_int,
        prefix="valid",
        sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    self.main_loop(True)
if args.continue_from:
    from blocks.serialization import load
    main_loop = load(args.continue_from)
    main_loop.run()
    sys.exit(0)

graphs, extensions, updates = construct_graphs(args, nclasses)

### optimization algorithm definition
if args.optimizer == "adam":
    optimizer = Adam(learning_rate=args.learning_rate)
    # zzzz
    optimizer.learning_rate = theano.shared(
        np.asarray(optimizer.learning_rate, dtype=theano.config.floatX))
elif args.optimizer == "rmsprop":
    optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
elif args.optimizer == "sgdmomentum":
    optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)

step_rule = CompositeRule([
    StepClipping(1.),
    optimizer,
])
algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                            parameters=graphs["training"].parameters,
                            step_rule=step_rule)
algorithm.add_updates(updates["training"])
model = Model(graphs["training"].outputs[0])
extensions = extensions["training"] + extensions["inference"]

extensions.append(SharedVariableModifier(
    optimizer.learning_rate,
n_entities = 550
embed_size = 200
ctx_lstm_size = [256]
ctx_skip_connections = True
question_lstm_size = [256]
question_skip_connections = True
attention_mlp_hidden = [100]
attention_mlp_activations = [Tanh()]
out_mlp_hidden = []
out_mlp_activations = []

step_rule = CompositeRule([
    RMSProp(decay_rate=0.95, learning_rate=5e-5),
    BasicMomentum(momentum=0.9)
])

dropout = 0.2
w_noise = 0.
valid_freq = 1000
save_freq = 1000
print_freq = 100
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)
train_batch = int(config.get('hyperparams', 'train_batch', 256))
valid_batch = int(config.get('hyperparams', 'valid_batch', 256))
test_batch = int(config.get('hyperparams', 'valid_batch', 256))

W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
W_b = float(config.get('hyperparams', 'W_b', 0.01))
dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
solver = config.get('hyperparams', 'solver_type', 'rmsprop')
data_file = config.get('hyperparams', 'data_file')

if 'adagrad' in solver:
    solver_type = AdaGrad()
else:
    solver_type = RMSProp(learning_rate=base_lr)

pre_trained_folder = '../models/'
input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}

train = H5PYDataset(data_file, which_set='train',
                    sources=['l_features', 'r_features'])
valid = H5PYDataset(data_file, which_set='valid',
                    sources=['l_features', 'r_features'])
test = H5PYDataset(data_file, which_set='test',
                   sources=['l_features', 'r_features'])

x_l = tensor.matrix('l_features')
x_r = tensor.matrix('r_features')
x = tensor.concatenate([x_l, x_r], axis=1)

# Define a feed-forward net with an input, two hidden layers, and a softmax output:
autoencoder = MLP(activations=[
    #Rectifier(name='h1'),
    Rectifier(name='h1'),
gen_params = gen_filter(cg.variables)
dis_params = dis_filter(cg.variables)

gan.dis_params = dis_params
gan.gen_params = gen_params

# print y_hat1.eval(test_data)
# print y_hat0.eval(test_data)
# raise

algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj,
                          model=gan, dis_iter=1,
                          step_rule=RMSProp(learning_rate=1e-4),
                          gen_consider_constant=z)

neg_sample = gan.sampling(size=25)

from blocks.monitoring.aggregation import minimum

monitor = TrainingDataMonitoring(
    variables=[minimum(gen_obj), minimum(dis_obj)],
    prefix="train", after_batch=True)

subdir = './exp/' + 'mnist' + "-" + time.strftime("%Y%m%d-%H%M%S")

check_point = Checkpoint("{}/{}".format(subdir, 'mnist'),
                         every_n_epochs=50,
                         save_separately=['log', 'model'])
# MODEL
x = tensor.matrix('features', dtype='uint8')
y = tensor.matrix('targets', dtype='uint8')
y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers,
                              model)

# COST
cg = ComputationGraph(cost)

if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
    inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
    cost = cg.outputs[0]

# Learning algorithm
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True,
                                   data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                       after_batch=True,
                                       before_first_epoch=True,
                                       prefix='tra')

extensions = [dev_monitor, train_monitor, Timing(),
              Printing(after_batch=True),
y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                              num_recurrent_layers, train_flag)

# COST
cg = ComputationGraph(cost)
extra_updates = []

# Learning optimizer
if training_optimizer == 'Adam':
    step_rules = [Adam(learning_rate=learning_rate),
                  StepClipping(step_clipping)]
    # , VariableClipping(threshold=max_norm_threshold)
elif training_optimizer == 'RMSProp':
    step_rules = [RMSProp(learning_rate=learning_rate,
                          decay_rate=decay_rate),
                  StepClipping(step_clipping)]
elif training_optimizer == 'Adagrad':
    step_rules = [AdaGrad(learning_rate=learning_rate),
                  StepClipping(step_clipping)]
elif training_optimizer == 'Adadelta':
    step_rules = [AdaDelta(decay_rate=decay_rate),
                  StepClipping(step_clipping)]

parameters_to_update = cg.parameters
algorithm = GradientDescent(cost=cg.outputs[0],
                            parameters=parameters_to_update,
                            step_rule=CompositeRule(step_rules))
algorithm.add_updates(extra_updates)
input_dim = 1
out_dim = 1
hidden_dim = 64
activation_function = Tanh()
activation_function_name = 'Tanh'
batch_size = 100
w_noise_std = 0.01
i_dropout = 0.5
proportion_train = 0.9

algo = 'RMS'
learning_rate_value = 1e-5
momentum_value = 0.9
decay_rate_value = 0
StepClipping_value = 2
step_rule = CompositeRule([RMSProp(learning_rate=learning_rate_value),
                           #decay_rate=decay_rate_value,
                           BasicMomentum(momentum=momentum_value),
                           StepClipping(StepClipping_value)])

print_freq = 1000
valid_freq = 10000
save_freq = 10000


class Model():
    def __init__(self):
        inp = tensor.tensor3('input')
        inp = inp.dimshuffle(1, 0, 2)
        target = tensor.matrix('target')
        target = target.reshape((target.shape[0],))
        product = tensor.lvector('product')
        missing = tensor.eq(inp, 0)
        train_input_mean = 1470614.1
if not os.path.exists(save_path):
    os.makedirs(save_path)
log_path = save_path + '/log.txt'
fh = logging.FileHandler(filename=log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

print 'Building training process...'
model = Model(cost)
params = ComputationGraph(cost).parameters
print_num_params(params)
clipping = StepClipping(threshold=np.cast[floatX](1.0))
# Momentum(learning_rate=args.learning_rate, momentum=0.9)
rm_non_finite = RemoveNotFinite()
rms_prop = RMSProp(learning_rate=1e-3, decay_rate=0.5)
step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
algorithm = GradientDescent(cost=cost, parameters=params,
                            step_rule=step_rule)

# train_stream, valid_stream = get_seq_mnist_streams(
#     h_dim, batch_size, update_prob)
train_stream = get_stream('train', batch_size, h_dim, False)
train_stream_evaluation = get_stream('train', batch_size, h_dim, True)
valid_stream = get_stream('valid', batch_size, h_dim, True)

if load_path:
    with open(load_path + '/trained_params_best.npz') as f:
        loaded = np.load(f)
shuffle_entities = True
concat_ctx_and_question = False
concat_question_before = False

embed_size = 200
ctx_lstm_size = [256, 256]
ctx_skip_connections = False
question_lstm_size = [256]
question_skip_connections = True
attention_mlp_hidden = [200]
attention_mlp_activations = [Tanh()]

step_rule = CompositeRule([RMSProp(decay_rate=0.95, learning_rate=5e-5),
                           BasicMomentum(momentum=0.9)])

dropout = 0.2
w_noise = 0.
valid_freq = 10000
save_freq = 10000
print_freq = 1000
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)
transition_weights_init = Orthogonal()
batch_size = 32
sort_batch_count = 20

shuffle_questions = True
concat_ctx_and_question = True
concat_question_before = True

embed_size = 200
lstm_size = [256, 256]
skip_connections = True
out_mlp_hidden = []
out_mlp_activations = []

step_rule = CompositeRule([
    RMSProp(decay_rate=0.95, learning_rate=1e-4),
    BasicMomentum(momentum=0.9)
])

dropout = 0.1
valid_freq = 1000
save_freq = 1000
print_freq = 100
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width,
          weight_noise, z_prob, z_prob_states, z_prob_cells,
          drop_prob_igates, ogates_zoneout, batch_size, stoch_depth,
          share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff,
          penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay,
          **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """Convert integer encoding for class-labels (starting with 0 !)
        to one-hot encoding.

        The output is an array whose shape is the shape of the input array
        plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when '
                'using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset)
                            - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################
    data = train_stream.get_epoch_iterator(as_dict=True).next()
    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize,
                             layer_size * 4,
                             name='in_to_hid%d' % l,
                             weights_init=weights_init,
                             biases_init=Constant(0.0))
                      for l in range(num_layers)]
        recurrent_layers = [DropLSTM(dim=layer_size,
                                     weights_init=weights_init,
                                     activation=Tanh(),
                                     model_type=6,
                                     name='rnn%d' % l,
                                     ogates_zoneout=ogates_zoneout)
                            for l in range(num_layers)]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize,
                             layer_size * 3,
                             name='in_to_hid%d' % l,
                             weights_init=weights_init,
                             biases_init=Constant(0.0))
                      for l in range(num_layers)]
        recurrent_layers = [DropGRU(dim=layer_size,
                                    weights_init=weights_init,
                                    activation=Tanh(),
                                    name='rnn%d' % l)
                            for l in range(num_layers)]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize,
                             layer_size,
                             name='in_to_hid%d' % l,
                             weights_init=weights_init,
                             biases_init=Constant(0.0))
                      for l in range(num_layers)]
        recurrent_layers = [DropSimpleRecurrent(dim=layer_size,
                                                weights_init=weights_init,
                                                activation=Rectifier(),
                                                name='rnn%d' % l)
                            for l in range(num_layers)]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  # in_to_hid.apply(x)
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init],
                                                states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################
    cost = CategoricalCrossEntropy().apply(
        y.reshape((-1, alphabetsize)), y_hat).copy('cost')
    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis),
                                numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(T.sum((norms[1:] - norms[:-1]) ** 2, axis=0)
                                / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name for o in
                VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1]) ** 2, axis=0)
                        / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    # should this be cost_train.outputs[0]? no.
    cost_train = cost_train.copy('cost_train')

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(
    #     post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if len(v.shape) == 2:  # only 2-D arrays have a condition number
            whysplode_cond.append(cond_number(p).copy(
                'ini%d:%s_cond(%s)'
                % (i, p.name, "x".join(map(str, v.shape)))))
            whysplode_rms.append(rms(p).copy(
                'ini%d:%s_rms(%s)'
                % (i, p.name, "x".join(map(str, v.shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if len(v.shape) == 2:
            whysplode_cond.append(cond_number(p).copy(
                'ini%d:%s_cond(%s)'
                % (i, p.name, "x".join(map(str, v.shape)))))
            whysplode_rms.append(rms(p).copy(
                'ini%d:%s_rms(%s)'
                % (i, p.name, "x".join(map(str, v.shape)))))

    observed_vars = [cost_train, cost, bpc, perp, learning_rate,
                     aggregation.mean(algorithm.total_gradient_norm
                                      ).copy("gradient_norm_mean")]
    # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train", after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()
                              ).replace(zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(
        variables=[dev_cost, dev_bpc, dev_perp],
        data_stream=valid_stream, prefix="dev",
        updates=dev_init_updates)

    # noone does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp]
                                   + init_updates.values()
                                   ).replace(zip(init_updates.keys(),
                                                 test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream, prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(SaveParams('dev_cost', model, experiment_path,
                                 every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """Rolls the cell and state activations between epochs so that the
        first batch gets correct initial activations."""
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(RollsExtension(init_updates.keys()
                                     + dev_init_updates.keys()
                                     + (test_init_updates.keys()
                                        if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """Lets you set a number to divide the learning rate by each epoch,
        plus when to start doing that."""
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(
                    learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)))
    train_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean()) for channel in
        VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)])
    for name, channel in monitoring_channels.items():
        channel.name = name
    monitored_quantities = monitoring_channels.values() + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(monitored_quantities, train_monitor_stream,
                                 prefix="train",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, valid_monitor_stream,
                                 prefix="valid",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, test_monitor_stream,
                                 prefix="test",
                                 updates=computation_graph.updates),
            ProgressBar(),
            Printing()])
    main_loop.run()
def main(job_id, params):
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    data_file = config.get('hyperparams', 'data_file')

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        train = H5PYDataset(data_file, which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a
    # softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[input_dim[side], hidden_units, hidden_units, 2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs
                      if input.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)
    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(train.num_examples,
                                            batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(valid.num_examples,
                                            batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started
    # to overfit
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
def main(args):
    """Run experiment."""
    lr_tag = float_tag(args.learning_rate)

    x_dim, train_stream, valid_stream, test_stream = datasets.get_streams(
        args.data, args.batch_size)

    #------------------------------------------------------------
    # Setup model
    deterministic_act = Tanh
    deterministic_size = 1.

    if args.method == 'vae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples,
             sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise ValueError("Unknown hidden nonlinearity %s"
                             % args.activation)

        model = VAE(x_dim=x_dim, hidden_layers=layer_sizes,
                    hidden_act=hidden_act, z_dim=z_dim,
                    batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'dvae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples,
             sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise ValueError("Unknown hidden nonlinearity %s"
                             % args.activation)

        model = DVAE(x_dim=x_dim, hidden_layers=layer_sizes,
                     hidden_act=hidden_act, z_dim=z_dim,
                     batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        qbase = "" if not args.no_qbaseline else "noqb-"

        name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, qbase, lr_tag,
             args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = ReweightedWakeSleep(
            p_layers,
            q_layers,
            qbaseline=(not args.no_qbaseline),
        )
        model.initialize()
    elif args.method == 'bihm-rws':
        sizes_tag = args.layer_spec.replace(",", "-")

        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag,
             args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = BiHM(
            p_layers,
            q_layers,
            l1reg=args.l1reg,
            l2reg=args.l2reg,
        )
        model.initialize()
    elif args.method == 'continue':
        import cPickle as pickle
        from os.path import basename, splitext

        with open(args.model_file, 'rb') as f:
            m = pickle.load(f)

        if isinstance(m, MainLoop):
            m = m.model

        model = m.get_top_bricks()[0]
        while len(model.parents) > 0:
            model = model.parents[0]

        assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE))

        mname, _, _ = basename(args.model_file).rpartition("_model.pkl")
        name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag,
                                          args.n_samples)
    else:
        raise ValueError("Unknown training method '%s'" % args.method)

    #------------------------------------------------------------

    x = tensor.matrix('features')

    #------------------------------------------------------------
    # Testset monitoring
    train_monitors = []
    valid_monitors = []
    test_monitors = []
    for s in [1, 10, 100, 1000]:
        log_p, log_ph = model.log_likelihood(x, s)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p_%d" % s
        log_ph.name = "log_ph_%d" % s

        #train_monitors += [log_p, log_ph]
        #valid_monitors += [log_p, log_ph]
        test_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Z estimation
    #for s in [100000]:
    #    z2 = tensor.exp(model.estimate_log_z2(s)) / s
    #    z2.name = "z2_%d" % s
    #
    #    valid_monitors += [z2]
    #    test_monitors += [z2]

    #------------------------------------------------------------
    # Gradient and training monitoring
    if args.method in ['vae', 'dvae']:
        log_p_bound, gradients = model.get_gradients(x, args.n_samples)
        log_p_bound = -log_p_bound.mean()
        log_p_bound.name = "log_p_bound"
        cost = log_p_bound

        train_monitors += [log_p_bound,
                           named(model.kl_term.mean(), 'kl_term'),
                           named(model.recons_term.mean(), 'recons_term')]
        valid_monitors += [log_p_bound,
                           named(model.kl_term.mean(), 'kl_term'),
                           named(model.recons_term.mean(), 'recons_term')]
        test_monitors += [log_p_bound,
                          named(model.kl_term.mean(), 'kl_term'),
                          named(model.recons_term.mean(), 'recons_term')]
    else:
        log_p, log_ph, gradients = model.get_gradients(x, args.n_samples)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p"
        log_ph.name = "log_ph"
        cost = log_ph

        train_monitors += [log_p, log_ph]
        valid_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Detailed monitoring
    """
    n_layers = len(p_layers)

    log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples)

    exp_samples = []
    for l in xrange(n_layers):
        e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1)
        e.name = "inference_h%d" % l
        e.tag.aggregation_scheme = aggregation.TakeLast(e)
        exp_samples.append(e)

    s1 = samples[1]
    sh1 = s1.shape
    s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]])
    s0, _ = model.p_layers[0].sample_expected(s1_)
    s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]])
    s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1)
    s0.name = "inference_h0^"
    s0.tag.aggregation_scheme = aggregation.TakeLast(s0)
    exp_samples.append(s0)

    # Draw P-samples
    p_samples, _, _ = model.sample_p(100)
    #weights = model.importance_weights(samples)
    #weights = weights / weights.sum()

    for i, s in enumerate(p_samples):
        s.name = "psamples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)

    # samples = model.sample(100, oversample=100)
    for i, s in enumerate(samples):
        s.name = "samples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)
    """

    cg = ComputationGraph([cost])

    #------------------------------------------------------------
    if args.step_rule == "momentum":
        step_rule = Momentum(args.learning_rate, 0.95)
    elif args.step_rule == "rmsprop":
        step_rule = RMSProp(args.learning_rate)
    elif args.step_rule == "adam":
        step_rule = Adam(args.learning_rate)
    else:
        raise ValueError("Unknown step_rule %s" % args.step_rule)

    #parameters = cg.parameters[:4] + cg.parameters[5:]
    parameters = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        parameters=parameters,
        gradients=gradients,
        step_rule=CompositeRule([
            #StepClipping(25),
            step_rule,
            #RemoveNotFinite(1.0),
        ]))

    #------------------------------------------------------------
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm),
                       aggregation.mean(algorithm.total_step_norm)]

    #------------------------------------------------------------
    # Live plotting?
    plotting_extensions = []
    if args.live_plotting:
        plotting_extensions = [
            PlotManager(
                name,
                [Plotter(channels=[
                    ["valid_%s" % cost.name, "valid_log_p"],
                    ["train_total_gradient_norm", "train_total_step_norm"]],
                    titles=["validation cost",
                            "norm of training gradient and step"]),
                 DisplayImage(
                    [WeightDisplay(
                        model.p_layers[0].mlp.linear_transformations[0].W,
                        n_weights=100, image_shape=(28, 28))]
                    #ImageDataStreamDisplay(test_stream, image_shape=(28,28))]
                 )])
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            ProgressBar(),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            DataStreamMonitoring(
                valid_monitors, data_stream=valid_stream, prefix="valid"),
            DataStreamMonitoring(
                test_monitors, data_stream=test_stream, prefix="test",
                after_epoch=False, after_training=True,
                every_n_epochs=10),
            #SharedVariableModifier(
            #    algorithm.step_rule.components[0].learning_rate,
            #    half_lr_func,
            #    before_training=False,
            #    after_epoch=False,
            #    after_batch=False,
            #    every_n_epochs=half_lr),
            TrackTheBest('valid_%s' % cost.name),
            Checkpoint(name + ".pkl", save_separately=['log', 'model']),
            FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name,
                                       epochs=args.patience),
            FinishAfter(after_n_epochs=args.max_epochs),
            Printing()
        ] + plotting_extensions)
    main_loop.run()
from blocks.initialization import IsotropicGaussian, Constant
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph, apply_noise, apply_dropout

from datastream import RandomTransposeIt

step_rule_name = 'adam'
learning_rate = 0.1
momentum = 0.9

if step_rule_name == 'adadelta':
    step_rule = AdaDelta()
elif step_rule_name == 'rmsprop':
    step_rule = RMSProp()
elif step_rule_name == 'momentum':
    step_rule_name = "mom%s,%s" % (repr(learning_rate), repr(momentum))
    step_rule = Momentum(learning_rate=learning_rate, momentum=momentum)
elif step_rule_name == 'adam':
    step_rule = Adam()
else:
    raise ValueError("No such step rule: " + step_rule_name)

ibatchsize = None
iter_scheme = RandomTransposeIt(ibatchsize, False, None, False)
valid_iter_scheme = RandomTransposeIt(ibatchsize, False, None, False)

w_noise_std = 0.05
r_dropout = 0.0
s_dropout = 0.0
def test_rmsprop_broadcastable():
    verify_broadcastable_handling(RMSProp(0.1, 0.5, 1e5))
def train(args, trial=11, no_valid=False):
    # Build unique strings identifying this experiment's data files.
    data_valid = "data/" + args.data_name + "_trial_" + str(trial) + \
        "_valid_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions)
    data_test = data_valid.replace("_valid_size", "_test_size")

    # If we want the validation set to match the modData of the test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = "data/" + args.data_name + "_trial_" + str(trial) + \
        "_train_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions)

    subStr = "rnn_type_" + args.rnn_type + "_trial_" + str(trial) + \
        "_hiddenSize_" + str(args.hidden_size) + \
        "_numLayers_" + str(args.num_layers) + \
        "_dropout_" + str(args.dropout) + \
        "_train_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions) + \
        "_novalid_" + str(args.no_valid)

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")
        data_valid = "data/" + args.data_name + "_m3_trial_" + str(trial) + \
            "_valid_size_" + str(args.train_size) + \
            "_transitions_" + str(args.transitions)
        data_test = "data/" + args.data_name + "_m3_trial_" + str(trial) + \
            "_test_size_" + str(args.train_size) + \
            "_transitions_" + str(args.transitions)

    print("on test: " + subStr)

    # Perform folder prefixing
    prefix_path = models_folder + args.data_name + "/" + subStr + \
        "_tgrad_" + str(args.truncate_gradient) + \
        "_boost_" + bStr(args.boosting)
    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path
    plots_output2 = plots_output + args.data_name + "/" + subStr + \
        "_tgrad_" + str(args.truncate_gradient) + \
        "_boost_" + bStr(args.boosting)

    # Obtain the vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid and test streams, with the data held in GPU memory
    sharedDataTrain, train_stream = get_stream_inGPU(
        data_train, sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(
        data_valid, sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(
        data_test, sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums used to accumulate the MRR statistics
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {'sharedMRRSUM': sharedMRRSUM,
                     'sharedTOTSUM': sharedTOTSUM}

    # Batch boundaries, used to slice the GPU-resident data
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'
    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'
    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'
    x_mask_o_mask = sharedDataTrain['x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'
    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'
    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'
    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'
    y_mask_o_mask = sharedDataTrain['y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'
    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX))
        for _ in range(6)]
    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')) for _ in range(3)]

    # The final mask combines the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build the neural network
    linear_output, cost = nn_fprop(
        x, x_mask_final, y, y_mask_final, lens, vocab_size, hidden_size,
        num_layers, rnn_type, boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in GPU memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations: accumulate the MRR sums over batches
    fRR = function(
        inputs=[theano.In(batch_index_From, borrow=True),
                theano.In(batch_index_To, borrow=True)],
        updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                 (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # Cost
    cg = ComputationGraph(cost)
    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al., 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm: RMSProp followed by step clipping
    step_rules = [RMSProp(learning_rate=rmsPropLearnRate,
                          decay_rate=decay_rate),
                  StepClipping(step_clipping)]
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions: this is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        # This is faster than the stream-based monitor commented out below
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars, prefix=prefixes[0], after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        # train_monitor = DataStreamMonitoringPlot(
        #     variables=[cost], data_stream=train_streamCopy,
        #     prefix=prefixes[0], sharedDataTrain=sharedDataTrain,
        #     sharedDataActualTest=sharedDataTrain, after_batch=True,
        #     saveEveryXIteration=saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost], data_stream=valid_streamCopy,
            prefix=prefixes[1], sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid, after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost], data_stream=test_streamCopy,
            prefix=prefixes[2], sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest, after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting', saveFolder=plots_output2,
                    channels=['train_cross_cost', 'valid_cross_cost',
                              'test_cross_cost'],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [train_monitor, valid_monitor, test_monitor, plot,
                      Printing(), ProgressBar()] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(
            after_epoch=True, before_epoch=False, data_stream=dev_stream,
            prefix="valid", fRR=fRR, sharedVars=sharedSUMVARs,
            sharedDataTrain=sharedDataTrain, sharedDataValid=sharedDataValid)
        extensions = [dev_monitor, Printing(), ProgressBar()] + trackbest

    # Optionally decay the learning rate after every epoch
    if learning_rate_decay not in (0, 1):
        extensions.append(SharedVariableModifier(
            step_rules[0].learning_rate,
            lambda n, lr: np.cast[theano.config.floatX](
                learning_rate_decay * lr),
            after_epoch=True, after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))

    # Finally, build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
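# Hypothetical driver (an assumption, not part of the original source): an
# argparse front end exposing exactly the attributes that train() reads from
# `args`. The flag names mirror those attributes; the defaults are
# illustrative only.
import argparse

def parse_train_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_name', default='dataset')
    parser.add_argument('--rnn_type', default='lstm')
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--train_size', type=int, default=10000)
    parser.add_argument('--transitions', type=int, default=1)
    parser.add_argument('--no_valid', action='store_true')
    parser.add_argument('--truncate_gradient', type=int, default=-1)
    parser.add_argument('--boosting', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    train(parse_train_args(), trial=11)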