def Adam(cost, params, lr=0.002, b1=0.2, b2=0.001, e=1e-8):
    """Build Theano update pairs implementing an Adam-style optimizer.

    Parameters
    ----------
    cost
        Symbolic expression that is differentiated with ``T.grad``.
    params
        Sequence of shared variables to optimize. Each one must provide
        ``get_value()`` and a string ``name`` (the name is used to label
        the per-parameter accumulators).
    lr
        Base learning rate; it is rescaled at every step by the bias
        correction term computed from ``b1``, ``b2`` and the step counter.
    b1
        Weight given to the *new* gradient in the first-moment running
        average (the previous average gets ``1 - b1_t``).
    b2
        Weight given to the *new* squared gradient in the second-moment
        running average.
    e
        Small constant added to the denominator; ``1 - e`` is also used as
        the per-step decay factor applied to ``b1``.

    Returns
    -------
    updates : list of (shared variable, expression) pairs
        Step-counter, accumulator and parameter updates for one step.
    updates_init : list of (shared variable, expression) pairs
        Updates that reset every ``m`` and ``v`` accumulator to zero.
        The step counter is not part of this list.
    """
    decay_factor = 1 - e
    grads = T.grad(cost, params)

    # Shared step counter, incremented once per application of `updates`.
    i = shared_floatx(0., "adam_t")
    i_t = i + 1
    updates = [(i, i_t)]
    updates_init = []

    # Bias-corrected learning rate and decayed first-moment coefficient.
    lr_t = (lr * T.sqrt((1. - (1. - b2)**i_t)) / (1. - (1. - b1)**i_t))
    b1_t = 1 - (1 - b1) * decay_factor ** (i_t - 1)

    for p, g in zip(params, grads):
        m = shared_floatx(p.get_value() * 0., "adam_m_" + p.name)
        v = shared_floatx(p.get_value() * 0., "adam_v_" + p.name)

        # Blend the new gradient with the *previous* first-moment estimate.
        # The original mixed `g` with itself, which collapsed m_t to g and
        # left the accumulator `m` written but never read.
        m_t = b1_t * g + (1 - b1_t) * m
        v_t = b2 * T.sqr(g) + (1 - b2) * v
        g_t = m_t / (T.sqrt(v_t) + e)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p - lr_t * g_t))

        updates_init.append((m, 0 * m))
        updates_init.append((v, 0 * v))

    return updates, updates_init
def test_perclass_accuracy_monitor():
    """Check the per-class accuracies that PerClassAccuracyMonitor logs."""
    raw_features = [[1, 2], [3, 4], [5, 6]]
    features = [numpy.array(row, dtype=floatX) for row in raw_features]
    datastream = DataStream(IterableDataset(dict(features=features)))
    label_i_to_c = {0: "a", 1: "b", 2: "c"}

    # Row-wise argmax of these probabilities is [2, 0, 1, 1].
    probabilities = numpy.array([
        [0.0, 0.0, 1.0],
        [0.75, 0.25, 0.0],
        [0.0, 0.75, 0.25],
        [0.25, 0.75, 0.0],
    ], dtype=floatX)
    test_probs = shared_floatx(probabilities)
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [1.0], [2.0]], dtype=floatX))

    monitor = PerClassAccuracyMonitor(
        datastream,
        prediction=numpy.argmax(test_probs, axis=1),
        targets=targets.ravel(),
        label_i_to_c=label_i_to_c)
    monitor.main_loop = setup_mainloop([])
    monitor.do('after_batch')

    expected_entries = [
        ('perclass accuracy_a', 1.0),
        ('perclass accuracy_b', 1.0),
        ('perclass accuracy_c', 0.5),
    ]
    for key, expected in expected_entries:
        assert monitor.main_loop.log[0][key] == expected
def compute_step(self, parameter, previous_step):
    """Rescale ``previous_step`` using two running mean-square buffers.

    Two zero-initialised shared buffers shaped like ``parameter`` are
    created: one tracks the decayed mean square of the incoming steps, the
    other the decayed mean square of the emitted steps.

    Returns
    -------
    step
        ``previous_step`` scaled by the ratio of the two RMS values.
    updates
        Update pairs for both buffers.
    """
    def _zero_buffer(name):
        # Fresh buffer with the parameter's shape, tagged as algorithm state.
        buf = shared_floatx(parameter.get_value() * 0., name)
        add_role(buf, ALGORITHM_BUFFER)
        return buf

    step_sq_avg = _zero_buffer("mean_square_step_tm1")
    delta_sq_avg = _zero_buffer("mean_square_delta_x_tm1")

    rho = self.decay_rate
    eps = self.epsilon

    # Running average of the squared incoming step, including this one.
    step_sq_avg_new = (
        rho * step_sq_avg + (1 - rho) * tensor.sqr(previous_step))

    # The numerator uses the *previous* emitted-step statistics.
    rms_delta_prev = tensor.sqrt(delta_sq_avg + eps)
    rms_step_new = tensor.sqrt(step_sq_avg_new + eps)
    delta_x = rms_delta_prev / rms_step_new * previous_step

    delta_sq_avg_new = (
        rho * delta_sq_avg + (1 - rho) * tensor.sqr(delta_x))

    buffer_updates = [
        (step_sq_avg, step_sq_avg_new),
        (delta_sq_avg, delta_sq_avg_new),
    ]
    return delta_x, buffer_updates
def test_perclass_accuracy_monitor():
    """PerClassAccuracyMonitor should log one accuracy entry per class."""
    features = []
    for row in ([1, 2], [3, 4], [5, 6]):
        features.append(numpy.array(row, dtype=floatX))
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    label_i_to_c = {0: "a", 1: "b", 2: "c"}
    test_probs = shared_floatx(numpy.array(
        [[0.0, 0.0, 1.0],
         [0.75, 0.25, 0.0],
         [0.0, 0.75, 0.25],
         [0.25, 0.75, 0.0]],
        dtype=floatX))
    targets = shared_floatx(numpy.array(
        [[2.0], [0.0], [1.0], [2.0]], dtype=floatX))

    # The predicted class is the argmax over each probability row.
    predicted_class = numpy.argmax(test_probs, axis=1)
    flat_targets = targets.ravel()

    perclass_monitor = PerClassAccuracyMonitor(
        datastream, prediction=predicted_class, targets=flat_targets,
        label_i_to_c=label_i_to_c)
    perclass_monitor.main_loop = setup_mainloop([])
    perclass_monitor.do('after_batch')

    assert perclass_monitor.main_loop.log[0]['perclass accuracy_a'] == 1.0
    assert perclass_monitor.main_loop.log[0]['perclass accuracy_b'] == 1.0
    assert perclass_monitor.main_loop.log[0]['perclass accuracy_c'] == 0.5
def test_training_data_monitoring():
    """Compare TrainingDataMonitoring log entries with hand-computed values."""
    dtype = theano.config.floatX
    weights = numpy.array([-1, 1], dtype=dtype)
    features = [numpy.array(f, dtype=dtype)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        """Log the cost computed with numpy right before each batch."""

        def before_batch(self, data):
            residual = ((W.get_value() * data["features"]).sum() -
                        data["targets"])
            self.main_loop.log.current_row['true_cost'] = residual**2

    per_batch_monitor = TrainingDataMonitoring(
        [W_sum, cost, V], prefix="train1", after_batch=True)
    per_epoch_monitor = TrainingDataMonitoring(
        [aggregation.mean(W_sum), cost], prefix="train2", after_epoch=True)

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=GradientDescent(cost=cost, parameters=[W],
                                  step_rule=Scale(0.001)),
        extensions=[FinishAfter(after_n_epochs=1),
                    per_batch_monitor,
                    per_epoch_monitor,
                    TrueCostExtension()])
    main_loop.run()

    log = main_loop.log

    # A plain shared variable can be monitored as well.
    assert_allclose(log.current_row['train1_V'], 7.0)

    # The reference cost is logged before a batch is processed while the
    # monitoring extension logs afterwards, hence the offset of one row.
    for i in range(n_batches):
        assert_allclose(log[i]['true_cost'], log[i + 1]['train1_cost'])

    mean_true_cost = (
        sum([log[i]['true_cost'] for i in range(n_batches)]) / n_batches)
    assert_allclose(log[n_batches]['train2_cost'], mean_true_cost)

    mean_w_sum = (
        sum([log[i]['train1_W_sum'] for i in range(1, n_batches + 1)]) /
        n_batches)
    assert_allclose(log[n_batches]['train2_W_sum'], mean_w_sum)
def test_gradient_descent_updates_keyword():
    """Extra pairs passed via ``updates=`` must end up in the algorithm."""
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    z = shared_floatx(5)

    gradients = OrderedDict([(W, W/2)])
    extra_updates = [(z, z + 1)]
    algorithm = GradientDescent(gradients=gradients, updates=extra_updates)

    # One update for the parameter and one for the user-supplied pair.
    assert len(algorithm.updates) == 2
    assert z in dict(algorithm.updates)
def plot_energy_surface(model):
    """Plot the model energy over a 2-D input grid and save it as ``E.png``.

    A regular grid over ``[-0.5, 0.5)`` in both input dimensions (step
    0.05) is built, the hidden state ``h`` is refined with 100 applications
    of ``model.map_update`` and the resulting ``model.energy(x, h)`` is
    drawn as a 3-D surface.

    Parameters
    ----------
    model
        Object providing ``nhid``, ``map_update(x, h)`` and
        ``energy(x, h)``.
    """
    # Imported for its side effect: older Matplotlib versions only register
    # the "3d" projection once this module has been imported.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    from matplotlib import cm
    from matplotlib.ticker import LinearLocator, FormatStrFormatter
    import matplotlib.pyplot as plt

    (x1, x2) = numpy.meshgrid(numpy.arange(-0.5, 0.5, 0.05),
                              numpy.arange(-0.5, 0.5, 0.05))
    # One row per grid point.
    x = shared_floatx(numpy.vstack((x1.flatten(), x2.flatten())).T)
    h = shared_floatx(numpy.zeros((x.get_value().shape[0], model.nhid)))

    map_f = theano.function(
        [], updates=OrderedDict([(h, model.map_update(x, h))]))
    energy_f = theano.function([], [model.energy(x, h)])

    for _ in range(100):
        map_f()
    (E_,) = energy_f()
    E_ = E_.reshape(x1.shape)

    fig = plt.figure()
    # `fig.gca(projection=...)` no longer accepts keyword arguments in
    # recent Matplotlib releases; `add_subplot` works on old and new ones.
    ax = fig.add_subplot(111, projection="3d")
    surf = ax.plot_surface(x1, x2, E_, rstride=1, cstride=1,
                           cmap=cm.coolwarm, linewidth=0, antialiased=False)
    ax.set_zlim(numpy.min(E_), numpy.max(E_))
    ax.zaxis.set_major_locator(LinearLocator(10))
    ax.zaxis.set_major_formatter(FormatStrFormatter("%.02f"))
    fig.colorbar(surf, shrink=0.5, aspect=5)

    # Save before showing: once the interactive window is closed the
    # figure may be discarded, which would leave an empty image on disk.
    plt.savefig("E.png")
    plt.show()
def __init__(self, decay_rate=0.95, epsilon=1e-6):
    """Store the hyperparameters as shared variables.

    Parameters
    ----------
    decay_rate
        Must lie in the closed interval [0, 1].
    epsilon
        Stored as a shared variable named "epsilon".

    Raises
    ------
    ValueError
        If ``decay_rate`` is outside [0, 1].
    """
    decay_rate_is_valid = 0.0 <= decay_rate <= 1.0
    if not decay_rate_is_valid:
        raise ValueError("decay rate needs to be in [0, 1]")

    shared_decay = shared_floatx(decay_rate, "decay_rate")
    self.decay_rate = shared_decay
    add_role(shared_decay, ALGORITHM_HYPERPARAMETER)

    shared_epsilon = shared_floatx(epsilon, "epsilon")
    self.epsilon = shared_epsilon
    add_role(shared_epsilon, ALGORITHM_HYPERPARAMETER)
def test_gradient_descent_updates_keyword():
    """GradientDescent keeps updates supplied through ``updates=``."""
    weight_values = numpy.array([[1, 2], [3, 4]])
    W = shared_floatx(weight_values)
    z = shared_floatx(5)

    algorithm = GradientDescent(
        gradients=OrderedDict([(W, W / 2)]),
        updates=[(z, z + 1)],
    )

    updated_variables = dict(algorithm.updates)
    assert len(algorithm.updates) == 2
    assert z in updated_variables
def test_confusion_matrix():
    """Run ConfusionMatrixMonitor and check the matrix it writes to disk.

    The monitor is configured to write into the ``confusionMatrixTest``
    directory every three batches; the file produced at iteration 3 is
    loaded and compared with the expected matrix. The output directory is
    removed afterwards, also when the run or the assertion fails.
    """
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    test_probs = shared_floatx(
        numpy.array([[0.75, 0.0, 0.0],
                     [0.75, 0.0, 0.0],
                     [0.0, 0.0, 0.75],
                     [0.0, 0.0, 0.75],
                     [0.75, 0.0, 0.0],
                     [0.0, 0.0, 0.75]], dtype=floatX))
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [2.0], [2.0], [0.0], [1.0]],
                    dtype=floatX))
    expected = numpy.array(
        [[1.0, 0.0, 0.0],
         [0.0, 0.0, 1.0],
         [(1.0 / 3.0), 0.0, (2.0 / 3.0)]], dtype=floatX)

    try:
        d = DirectoryCreator(directory="confusionMatrixTest")
        extension = ConfusionMatrixMonitor(
            datastream,
            prediction=numpy.argmax(test_probs, axis=1),
            targets=targets.ravel(),
            dest_directory="confusionMatrixTest",
            every_n_batches=3)
        main_loop = setup_mainloop([d, extension])
        main_loop.run()

        path = 'confusionMatrixTest/confusion_iterations_3.npz'
        assert_allclose(numpy.load(path), expected)
    finally:
        # Always clean up so a failing run does not leave stale output
        # behind for the next one; ignore_errors covers the case where the
        # directory was never created.
        shutil.rmtree('confusionMatrixTest', ignore_errors=True)
def test_step_clipping_no_threshold_regression():
    """Regression test for #1145: wrong output when threshold is None."""
    unclipped_rule = StepClipping()
    assert unclipped_rule.threshold is None

    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    steps, updates = unclipped_rule.compute_steps(gradients)

    # Without a threshold the rule must be a no-op.
    assert len(updates) == 0
    assert steps == gradients
def test_gradient_descent_finds_inputs_additional_updates():
    """Inputs used only by later-added updates must be discovered."""
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')

    gradients = OrderedDict([(W, W + 1)])
    algorithm = GradientDescent(gradients=gradients)

    # `m` appears only in this additional update, not in the gradients.
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()

    assert m in algorithm.inputs
def __init__(self, learning_rate=0.002, beta1=0.1, beta2=0.001,
             epsilon=1e-8, decay_factor=(1 - 1e-8)):
    """Wrap every hyperparameter in a named shared variable.

    Each argument is stored on the instance under its own name as a
    shared variable and tagged with the ALGORITHM_HYPERPARAMETER role.
    """
    self.learning_rate = shared_floatx(learning_rate, "learning_rate")
    self.beta1 = shared_floatx(beta1, "beta1")
    self.beta2 = shared_floatx(beta2, "beta2")
    self.epsilon = shared_floatx(epsilon, "epsilon")
    self.decay_factor = shared_floatx(decay_factor, "decay_factor")

    hyperparameters = (
        self.learning_rate,
        self.beta1,
        self.beta2,
        self.epsilon,
        self.decay_factor,
    )
    for hyperparameter in hyperparameters:
        add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)
def test_gradient_descent_non_match_parameters_gradients_not_ordered():
    """A plain dict of gradients with other parameters raises ValueError."""
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    z = shared_floatx(5)

    # The gradients are keyed by W while the parameter list holds z, and
    # the gradients come in an unordered dict.
    unordered_gradients = {W: 2 * W}
    assert_raises_regex(
        ValueError,
        "fixed order",
        GradientDescent,
        parameters=[z],
        gradients=unordered_gradients,
    )
def __init__(self, eta=0, gamma=0.55, seed=180891):
    """Store derived hyperparameters and create the random stream.

    The shared variables hold ``sqrt(eta)`` and ``gamma/2`` (not the raw
    arguments) and are named "eta" and "gamma" respectively.

    Parameters
    ----------
    eta
        Its square root is stored in ``self.eta_sqrt``.
    gamma
        Half of it is stored in ``self.gamma_half``.
    seed
        Seed passed to ``MRG_RandomStreams``.
    """
    eta_sqrt = shared_floatx(sqrt(eta), "eta")
    add_role(eta_sqrt, ALGORITHM_HYPERPARAMETER)
    self.eta_sqrt = eta_sqrt

    gamma_half = shared_floatx(gamma/2, "gamma")
    add_role(gamma_half, ALGORITHM_HYPERPARAMETER)
    self.gamma_half = gamma_half

    self.theano_random = rng_mrg.MRG_RandomStreams(seed=seed)
def test_step_clipping():
    """StepClipping rescales steps above the threshold and keeps others."""
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    tight_rule = StepClipping(4)
    loose_rule = StepClipping(5)

    # With threshold 4 the expected steps are (12/5, 16/5).
    tight_steps, _ = tight_rule.compute_steps(gradients)
    assert_allclose(tight_steps[0].eval(), 12 / 5.0)
    assert_allclose(tight_steps[1].eval(), 16 / 5.0)

    # With threshold 5 the steps are expected to come back unchanged.
    loose_steps, _ = loose_rule.compute_steps(gradients)
    assert_allclose(loose_steps[0].eval(), 3.0)
    assert_allclose(loose_steps[1].eval(), 4.0)
def test_remove_not_finite():
    """RemoveNotFinite replaces NaN/inf steps by the expected values."""
    default_rule = RemoveNotFinite()
    unit_rule = RemoveNotFinite(1.)

    # One NaN and one infinite gradient, keyed by 1 and 2.
    gradients = {1: shared_floatx(numpy.nan), 2: shared_floatx(numpy.inf)}

    default_steps, _ = default_rule.compute_steps(gradients)
    assert_allclose(default_steps[1].eval(), 0.1)
    assert_allclose(default_steps[2].eval(), 0.2)

    unit_steps, _ = unit_rule.compute_steps(gradients)
    assert_allclose(unit_steps[1].eval(), 1.0)
    assert_allclose(unit_steps[2].eval(), 2.0)
def test_training_data_monitoring():
    """Check per-batch and per-epoch training monitoring against numpy."""
    raw_features = [[1, 2], [3, 4], [5, 6]]
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in raw_features]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        """Write the numpy-computed cost to the log before every batch."""

        def before_batch(self, data):
            prediction = (W.get_value() * data["features"]).sum()
            squared_error = (prediction - data["targets"]) ** 2
            self.main_loop.log.current_row['true_cost'] = squared_error

    extensions = [
        FinishAfter(after_n_epochs=1),
        TrainingDataMonitoring([W_sum, cost, V], prefix="train1",
                               after_batch=True),
        TrainingDataMonitoring([aggregation.mean(W_sum), cost],
                               prefix="train2", after_epoch=True),
        TrueCostExtension(),
    ]
    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=GradientDescent(cost=cost, params=[W],
                                  step_rule=Scale(0.001)),
        extensions=extensions)
    main_loop.run()

    # Monitoring of a shared variable.
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    # The reference value is logged before the batch, the monitored value
    # after it, so the two live in neighbouring log rows.
    for i in range(n_batches):
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])

    true_costs = [main_loop.log[i]['true_cost'] for i in range(n_batches)]
    assert_allclose(main_loop.log[n_batches]['train2_cost'],
                    sum(true_costs) / n_batches)

    w_sums = [main_loop.log[i]['train1_W_sum']
              for i in range(1, n_batches + 1)]
    assert_allclose(main_loop.log[n_batches]['train2_W_sum'],
                    sum(w_sums) / n_batches)
def __init__(self, D_params, D_kind, momentum=0.):
    """Create one zero-initialised velocity per parameter.

    Parameters
    ----------
    D_params
        Mapping from parameter name to a shared variable providing
        ``get_value()``.
    D_kind
        Mapping from parameter name to its "kind"; it is re-keyed by the
        velocity name (``<parameter name>_momentum``).
    momentum
        Stored as a shared variable in ``self.momentum``.
    """
    self.momentum = shared_floatx(momentum)

    # Velocities and kinds, both keyed by the velocity name.
    self.velocities = OrderedDict()
    self.D_kind = {}

    for p_name in D_params:
        velocity_name = p_name+ "_momentum"
        zeros_like_param = D_params[p_name].get_value() * 0.
        velocity = shared_floatx(zeros_like_param)
        velocity.name = velocity_name
        self.velocities[velocity_name] = velocity
        self.D_kind[velocity_name] = D_kind[p_name]
def __init__(self, learning_rate=0.002, mu1=0.99, nu2=0.999,
             epsilon=1e-8, decay_prod=(1.)):
    """Turn all hyperparameters into named, role-tagged shared variables."""
    self.learning_rate = shared_floatx(learning_rate, "learning_rate")
    self.mu1 = shared_floatx(mu1, "mu1")
    self.nu2 = shared_floatx(nu2, "nu2")
    self.epsilon = shared_floatx(epsilon, "epsilon")
    self.decay_prod = shared_floatx(decay_prod, "decay_prod")

    # Tag them only after all of them exist, in declaration order.
    hyperparameters = (self.learning_rate, self.mu1, self.nu2,
                       self.epsilon, self.decay_prod)
    for hyperparameter in hyperparameters:
        add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)
def test_updates_algorithm_add_updates():
    """Updates added after construction run together with the initial ones."""
    n = shared_floatx(1)
    m = shared_floatx(0)

    algorithm = UpdatesAlgorithm(updates=[(n, n + 1)])
    algorithm.add_updates([(m, n % 2)])
    assert len(algorithm.updates) == 2
    algorithm.initialize()

    # `m` receives the parity of the value `n` had before each batch.
    expected_after_each_batch = [(2, 1), (3, 0)]
    for expected_n, expected_m in expected_after_each_batch:
        algorithm.process_batch({})
        assert_allclose(n.get_value(), expected_n)
        assert_allclose(m.get_value(), expected_m)
def __init__(self, initial_threshold=1.0, stdevs=4, decay=0.96,
             clip_to_mean=True, quick_variance_convergence=True, **kwargs):
    """Set up the adaptive clipping state and settings.

    Remaining keyword arguments are forwarded to the parent constructor.
    The running statistics are kept in shared variables: the log-norm
    average starts at ``log(initial_threshold)``, the counters start at 0
    and the threshold/level start as NaN.
    """
    super(AdaptiveStepClipping, self).__init__(**kwargs)

    # Plain Python settings.
    self.decay = decay
    self.stdevs = stdevs
    self.clip_to_mean = clip_to_mean
    self.quick_variance_convergence = quick_variance_convergence

    # Shared state.
    initial_log_norm = numpy.log(initial_threshold)
    self.gnorm_log_ave = shared_floatx(initial_log_norm,
                                       name="gnorm_log_ave")
    self.gnorm_log2_ave = shared_floatx(0, name="gnorm_log2_ave")
    self.adapt_steps = shared_floatx(0, name="adapt_steps")
    self.clip_threshold = shared_floatx(numpy.nan, name="clip_threshold")
    self.clip_level = shared_floatx(numpy.nan, name="clip_level")
def testing(self, fea2obj):
    """Apply the best saved network to the dev and test sets.

    The pickled main loop stored at ``config['net'] + '.best.pkl'`` is
    loaded, a fresh model graph is built in test mode, the saved parameter
    values are copied into it, and the compiled prediction function is run
    on the dev and test streams through ``self.applypredict``.

    Parameters
    ----------
    fea2obj
        Mapping from feature name to feature object. The entry under
        ``'targets'`` must expose ``t2idx`` (type -> index mapping).
    """
    config = self._config
    dsdir = config['dsdir']
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    networkfile = config['net']
    # NOTE(review): the batch size is fixed here; config['batchsize'] is
    # deliberately not consulted (it was commented out in the original).
    batch_size = 10000

    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info('#dev: %d #test: %d', len(devMentions), len(tstMentions))

    # NOTE(review): `load` unpickles the file; only use trusted files.
    main_loop = load(networkfile + '.best.pkl')
    logger.info('Model loaded. Building prediction function...')
    old_model = main_loop.model
    logger.info(old_model.inputs)

    t2idx = fea2obj['targets'].t2idx
    if 'use_mean_pred' in config:
        deterministic = str_to_bool(config['use_mean_pred'])
    else:
        deterministic = True

    kl_weight = shared_floatx(0.001, 'kl_weight')
    entropy_weight = shared_floatx(0.001, 'entropy_weight')
    cost, _, y_hat, _, _, _, _ = build_model_new(
        fea2obj, len(t2idx), self._config, kl_weight, entropy_weight,
        deterministic=deterministic, test=True)

    # Rebuild the model around the new graph and reuse the trained weights.
    model = Model(cost)
    model.set_parameter_values(old_model.get_parameter_values())

    # Collect the model inputs in the iteration order of `fea2obj`,
    # skipping every feature whose name contains 'targets'.
    theinputs = []
    for fe in fea2obj.keys():
        if 'targets' in fe:
            continue
        for inp in model.inputs:
            if inp.name == fe:
                theinputs.append(inp)
    # Was a Python-2 `print` statement; routed through the logger so the
    # module parses on Python 3 and matches the surrounding logging.
    logger.info('theinputs: %s', theinputs)

    predict = theano.function(theinputs, y_hat)
    test_stream, num_samples_test = get_comb_stream(
        fea2obj, 'test', batch_size, shuffle=False)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size, shuffle=False)
    logger.info('sources: %s -- number of test/dev samples: %d/%d',
                test_stream.sources, num_samples_test, num_samples_dev)

    # `items()` works on Python 2 and 3, unlike `iteritems()`.
    idx2type = {idx: t for t, idx in t2idx.items()}

    logger.info('Starting to apply on dev inputs...')
    self.applypredict(
        theinputs, predict, dev_stream, devMentions, num_samples_dev,
        batch_size, os.path.join(config['exp_dir'], config['matrixdev']),
        idx2type)
    logger.info('...apply on dev data finished')

    logger.info('Starting to apply on test inputs...')
    self.applypredict(
        theinputs, predict, test_stream, tstMentions, num_samples_test,
        batch_size, os.path.join(config['exp_dir'], config['matrixtest']),
        idx2type)
    logger.info('...apply on test data finished')
def __init__(self, initial_threshold=1.0, stdevs=4, decay=0.96,
             clip_to_mean=True, quick_variance_convergence=True, **kwargs):
    """Initialise the clipping statistics and store the settings.

    Extra keyword arguments go to the parent class. All running
    statistics live in shared variables so they can be updated as part
    of the training function.
    """
    super(AdaptiveStepClipping, self).__init__(**kwargs)

    # Running averages start from log(initial_threshold) and 0.
    log_threshold = numpy.log(initial_threshold)
    self.gnorm_log_ave = shared_floatx(log_threshold, name='gnorm_log_ave')
    self.gnorm_log2_ave = shared_floatx(0, name='gnorm_log2_ave')
    self.adapt_steps = shared_floatx(0, name='adapt_steps')

    # Start as NaN.
    not_a_number = numpy.nan
    self.clip_threshold = shared_floatx(not_a_number,
                                        name='clip_threshold')
    self.clip_level = shared_floatx(not_a_number, name='clip_level')

    self.decay = decay
    self.stdevs = stdevs
    self.clip_to_mean = clip_to_mean
    self.quick_variance_convergence = quick_variance_convergence
def __init__(self, D_params, D_kind, decay_rate=0.9, max_scaling=1e5):
    """Validate the settings and create one zero buffer per parameter.

    Parameters
    ----------
    D_params
        Mapping from parameter name to a shared variable providing
        ``get_value()``.
    D_kind
        Mapping from parameter name to its "kind"; re-keyed by the buffer
        name (``<parameter name>_decay``).
    decay_rate
        Must lie in [0, 1]; stored as a shared variable.
    max_scaling
        Must be positive; ``self.epsilon`` is set to its reciprocal.

    Raises
    ------
    ValueError
        If ``decay_rate`` or ``max_scaling`` is out of range.
    """
    decay_rate_is_valid = 0.0 <= decay_rate <= 1.0
    if not decay_rate_is_valid:
        raise ValueError("decay rate needs to be in [0, 1]")
    if max_scaling <= 0:
        raise ValueError("max. scaling needs to be greater than 0")

    self.decay_rate = shared_floatx(decay_rate)
    self.epsilon = 1. / max_scaling

    # Buffers and kinds, both keyed by the buffer name.
    self.velocities = OrderedDict()
    self.D_kind = {}
    for p_name in D_params:
        buffer_name = p_name+ "_decay"
        zeros_like_param = D_params[p_name].get_value() * 0.
        velocity = shared_floatx(zeros_like_param)
        velocity.name = buffer_name
        self.velocities[buffer_name] = velocity
        self.D_kind[buffer_name] = D_kind[p_name]
def __init__(self, threshold, axis=None):
    """Store the clipping threshold and the set of axes.

    Parameters
    ----------
    threshold
        Stored as a shared variable named "threshold".
    axis
        ``None`` (no axes) or a value accepted by ``pack``; the entries
        must be unique.

    Raises
    ------
    ValueError
        If ``axis`` contains duplicates.
    """
    if axis is None:
        axis = ()
    else:
        axis = pack(axis)
    unique_axes = set(axis)
    self.axis = unique_axes

    self.threshold = shared_floatx(threshold, "threshold")
    add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    # Duplicates collapse in the set, so a size mismatch reveals them.
    if len(unique_axes) != len(axis):
        raise ValueError("axis must be unique")
def __init__(self, decay_rate=0.9, max_scaling=1e5):
    """Validate and store the decay rate and the scaling bound.

    Parameters
    ----------
    decay_rate
        Must lie in [0, 1]; stored as a shared variable.
    max_scaling
        Must be positive; ``self.epsilon`` is set to ``1 / max_scaling``.

    Raises
    ------
    ValueError
        If either argument is out of range.
    """
    decay_rate_is_valid = 0.0 <= decay_rate <= 1.0
    if not decay_rate_is_valid:
        raise ValueError("decay rate needs to be in [0, 1]")
    if max_scaling <= 0:
        raise ValueError("max. scaling needs to be greater than 0")

    self.epsilon = 1. / max_scaling
    self.decay_rate = shared_floatx(decay_rate)
def test_shared_variable_modifier_two_params():
    """A two-argument modifier function scales the learning rate per batch."""
    dtype = theano.config.floatX
    weights = numpy.array([-1, 1], dtype=dtype)
    features = [numpy.array(f, dtype=dtype)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)

    # The function takes two arguments and multiplies the second by 0.2.
    shrink = lambda _, val: numpy.cast[theano.config.floatX](val * 0.2)
    modifier = SharedVariableModifier(step_rule.learning_rate, shrink)

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])
    main_loop.run()

    final_learning_rate = step_rule.learning_rate.get_value()
    assert_allclose(final_learning_rate, 0.001 * 0.2 ** n_batches,
                    atol=1e-5)
def setup_mainloop(extension):
    """Build a tiny main loop around a single extension.

    The loop trains a two-element parameter ``W`` on three feature
    vectors with plain scaled gradient descent and stops after one epoch.

    Parameters
    ----------
    extension
        Extension registered after the ``FinishAfter`` extension.

    Returns
    -------
    The configured (not yet run) ``MainLoop``.
    """
    dtype = theano.config.floatX
    features = [numpy.array(f, dtype=dtype)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))

    # Minimal quadratic cost so that there is something to optimise.
    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    return MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1), extension])
def compute_step(self, parameter, previous_step):
    """Combine ``previous_step`` with a per-parameter velocity buffer.

    A zero-initialised velocity shaped like ``parameter`` is created. The
    velocity is updated to ``momentum * velocity + previous_step`` and the
    returned step is
    ``momentum**2 * velocity + (1 + momentum) * previous_step``.

    Returns
    -------
    step
        The symbolic step described above.
    updates
        Single-element list with the velocity update.
    """
    mu = self.momentum
    velocity = shared_floatx(parameter.get_value() * 0.)

    new_velocity = mu*velocity + previous_step
    step = (mu**2 * velocity +
            previous_step * (1 + mu))

    return step, [(velocity, new_velocity)]
def test_linear_decay():
    """LinearDecay goes from 100 down to 0 and stays there."""
    lr = shared_floatx(100.0)
    decay = LinearDecay(lr, 1.0)

    # (iteration, expected value) pairs.
    checkpoints = [(0.0, 100.0), (50, 50.0), (100, 0.0), (200, 0.0)]
    for iteration, expected in checkpoints:
        assert_allclose(decay.compute_value(iteration), expected)
def test_polynomial_decay():
    """PolynomialDecay with these arguments decays linearly to zero."""
    lr = shared_floatx(100.0)
    decay = PolynomialDecay(lr, 100.0, 1.0)

    # (iteration, expected value) pairs; the value is 0 from 100 onwards.
    checkpoints = [(0, 100.0), (50, 50.0), (100, 0.0), (200, 0.0)]
    for iteration, expected in checkpoints:
        assert_allclose(decay.compute_value(iteration), expected)
def test_graph_inputs():
    """Only the symbolic matrix counts as an input of the graph."""
    matrix_input = tensor.matrix('a')
    shared_scalar = shared_floatx(0, 'b')
    constant = 3

    expression = matrix_input + shared_scalar + constant

    # The shared variable and the constant must not be reported.
    assert graph_inputs([expression]) == [matrix_input]
def setup_mainloop(extension, iteration_scheme=None):
    """Build a small main loop for the progress bar tests.

    Parameters
    ----------
    extension
        Extension registered after the ``FinishAfter`` extension.
    iteration_scheme
        Passed to the ``DataStream`` wrapping the dataset.

    Returns
    -------
    The configured (not yet run) ``MainLoop``.
    """
    # progressbar2 3.6.0 replaced the `maxval` keyword by `max_value`,
    # whose default is 100. Using 101 examples makes a test fail with an
    # out-of-range complaint if `maxval` is still used by accident.
    n_examples = 101
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2]] * n_examples]
    dataset = IterableDataset(dict(features=features))
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    extensions = [FinishAfter(after_n_epochs=1), extension]
    return MainLoop(model=None, data_stream=data_stream,
                    algorithm=algorithm, extensions=extensions)
def test_linear_decay():
    """Check LinearDecay at the start, midway, at zero and beyond."""
    initial_rate = shared_floatx(100.0)
    schedule = LinearDecay(initial_rate, 1.0)

    value_at_start = schedule.compute_value(0.0)
    assert_allclose(value_at_start, 100.0)

    value_midway = schedule.compute_value(50)
    assert_allclose(value_midway, 50.0)

    value_at_zero = schedule.compute_value(100)
    assert_allclose(value_at_zero, 0.0)

    # Past the end the value must not become negative.
    value_beyond = schedule.compute_value(200)
    assert_allclose(value_beyond, 0.0)
def setup_mainloop(extension):
    """Build a small main loop for the progress bar tests.

    Parameters
    ----------
    extension
        Extension registered after the ``FinishAfter`` extension.

    Returns
    -------
    The configured (not yet run) ``MainLoop``.
    """
    # progressbar2 3.6.0 replaced the `maxval` keyword by `max_value`,
    # whose default is 100. With 101 examples a leftover `maxval` makes
    # the progress bar complain about an out-of-range value.
    repeated_example = [[1, 2]] * 101
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in repeated_example]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    return MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1), extension])
def test_polynomial_decay():
    """Check PolynomialDecay at the start, midway, at the end and beyond."""
    initial_rate = shared_floatx(100.0)
    schedule = PolynomialDecay(initial_rate, 100.0, 1.0)

    value_at_start = schedule.compute_value(0)
    assert_allclose(value_at_start, 100.0)

    value_midway = schedule.compute_value(50)
    assert_allclose(value_midway, 50.0)

    value_at_end = schedule.compute_value(100)
    assert_allclose(value_at_end, 0.0)

    # Past the end the value stays at zero.
    value_beyond = schedule.compute_value(200)
    assert_allclose(value_beyond, 0.0)
def test_shared_variable_modifier():
    """A one-argument modifier function sets the learning rate per batch."""
    dtype = theano.config.floatX
    weights = numpy.array([-1, 1], dtype=dtype)
    features = [numpy.array(f, dtype=dtype)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W], step_rule=step_rule)

    # The new value is 10 divided by the single argument the function gets.
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda n: numpy.cast[theano.config.floatX](10. / n))

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])
    main_loop.run()

    expected_rate = numpy.cast[theano.config.floatX](10. / n_batches)
    assert_allclose(step_rule.learning_rate.get_value(), expected_rate)
def __init__(self, decay=0.95, gamma_clip=0.0, grad_clip=None,
             start_var_reduction=0, delta_clip=25, gamma_reg=1e-6,
             slow_decay=0.995, use_adagrad=True, perform_update=True,
             skip_nan_inf=False, use_corrected_grad=True):
    """Store the configuration of the step rule.

    ``decay`` must satisfy ``0 <= decay < 1`` (checked with assertions)
    and is the only value wrapped in a shared variable; every other
    argument is stored as a plain attribute. ``damping`` and the two
    ``tau`` bounds are fixed constants.
    """
    assert decay >= 0.
    assert decay < 1.

    # Decay settings.
    self.decay = shared_floatx(decay, "decay")
    self.slow_decay = slow_decay

    # Clipping and regularisation settings.
    self.gamma_clip = gamma_clip
    self.grad_clip = grad_clip
    self.delta_clip = delta_clip
    self.gamma_reg = gamma_reg
    self.damping = 1e-7

    # Behaviour switches.
    self.start_var_reduction = start_var_reduction
    self.use_adagrad = use_adagrad
    self.use_corrected_grad = use_corrected_grad
    self.perform_update = perform_update
    self.skip_nan_inf = skip_nan_inf

    # tau has to be bounded: left unchecked it can grow arbitrarily
    # large, which causes numerical instabilities for very deep networks,
    # and once it is very large it keeps increasing indefinitely.
    self.upper_bound_tau = 1e7
    self.lower_bound_tau = 1.5
def test_shared_variable_modifier():
    """SharedVariableModifier should leave the rate at 10 / n_batches."""
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = []
    for raw in ([1, 2], [3, 4], [5, 6]):
        features.append(numpy.array(raw, dtype=theano.config.floatX))
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector("features")
    y = tensor.scalar("targets")
    W = shared_floatx([0, 0], name="W")
    cost = ((x * W).sum() - y) ** 2
    cost.name = "cost"

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W], step_rule=step_rule)

    def rate_for(n):
        # New learning rate as a function of the single argument.
        return numpy.cast[theano.config.floatX](10.0 / n)

    extensions = [
        FinishAfter(after_n_epochs=1),
        SharedVariableModifier(step_rule.learning_rate, rate_for),
    ]
    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=extensions,
    )
    main_loop.run()

    assert_allclose(
        step_rule.learning_rate.get_value(),
        numpy.cast[theano.config.floatX](10.0 / n_batches))
def test_shared_variable_modifier_two_params():
    """The learning rate is multiplied by 0.2 once per batch."""
    raw_features = [[1, 2], [3, 4], [5, 6]]
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX) for f in raw_features]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)

    # Two-argument form: the first argument is ignored, the second is
    # scaled by 0.2.
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda _, val: numpy.cast[floatX](val * 0.2))

    extensions = [FinishAfter(after_n_epochs=1), modifier]
    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_default_stream(),
                         algorithm=sgd,
                         extensions=extensions)
    main_loop.run()

    expected = 0.001 * 0.2**n_batches
    assert_allclose(step_rule.learning_rate.get_value(), expected,
                    atol=1e-5)
def test_model_handles_brickless_parameteres():
    """A role-tagged shared variable outside any brick is a model parameter."""
    x = tensor.matrix('x')

    # Parameter created by hand, i.e. without an owning brick.
    v = shared_floatx(numpy.zeros((10, 10)), name='V')
    add_role(v, PARAMETER)

    model = Model(x.dot(v))
    parameter_items = list(model.get_parameter_dict().items())
    assert parameter_items == [('V', v)]
def setup_mainloop(extension):
    """Build a tiny main loop around a single extension.

    Parameters
    ----------
    extension
        Extension registered after the ``FinishAfter`` extension.

    Returns
    -------
    The configured (not yet run) ``MainLoop``; it stops after one epoch
    over three feature vectors.
    """
    dtype = theano.config.floatX
    features = [numpy.array(f, dtype=dtype)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))

    # Minimal quadratic cost over a two-element parameter.
    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, params=[W],
                                step_rule=Scale(1e-3))
    extensions = [FinishAfter(after_n_epochs=1), extension]
    return MainLoop(model=None,
                    data_stream=dataset.get_example_stream(),
                    algorithm=algorithm,
                    extensions=extensions)
def setup_mainloop(extensions):
    """Build a tiny main loop with a model and the given extensions.

    Parameters
    ----------
    extensions
        List of extensions appended after the ``FinishAfter`` extension.

    Returns
    -------
    The configured (not yet run) ``MainLoop``.
    """
    raw_features = [[1, 2], [3, 4], [5, 6]]
    features = [numpy.array(f, dtype=floatX) for f in raw_features]
    datastream = DataStream(IterableDataset(dict(features=features)))

    # The parameter is tagged explicitly because no brick owns it.
    W = shared_floatx([0, 0], name='W')
    add_role(W, PARAMETER)
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    all_extensions = [FinishAfter(after_n_epochs=1)] + extensions
    return MainLoop(
        model=Model(cost),
        data_stream=datastream,
        algorithm=algorithm,
        extensions=all_extensions)
def setup_mainloop(extensions):
    """Create a one-epoch main loop running the supplied extensions.

    Parameters
    ----------
    extensions
        List of extensions; they are placed after a ``FinishAfter``
        extension that stops training after one epoch.

    Returns
    -------
    The configured (not yet run) ``MainLoop``.
    """
    features = []
    for raw in ([1, 2], [3, 4], [5, 6]):
        features.append(numpy.array(raw, dtype=floatX))
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    W = shared_floatx([0, 0], name='W')
    add_role(W, PARAMETER)
    x = tensor.vector('features')
    squared_error = tensor.sum((x - W)**2)
    squared_error.name = "cost"

    sgd = GradientDescent(cost=squared_error, parameters=[W],
                          step_rule=Scale(1e-3))
    main_loop = MainLoop(
        model=Model(squared_error),
        data_stream=datastream,
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1)] + extensions)
    return main_loop
def __init__(self, learning_rate=0.002, mu1=0.99, nu2=0.999,
             epsilon=1e-8, decay_prod=(1.)):
    """Store each hyperparameter as a named shared variable.

    Every shared variable carries the argument's name and is tagged with
    the ALGORITHM_HYPERPARAMETER role once all of them are created.
    """
    self.learning_rate = shared_floatx(learning_rate, "learning_rate")
    self.mu1 = shared_floatx(mu1, "mu1")
    self.nu2 = shared_floatx(nu2, "nu2")
    self.epsilon = shared_floatx(epsilon, "epsilon")
    self.decay_prod = shared_floatx(decay_prod, "decay_prod")

    tagged = [
        self.learning_rate,
        self.mu1,
        self.nu2,
        self.epsilon,
        self.decay_prod,
    ]
    for shared_value in tagged:
        add_role(shared_value, ALGORITHM_HYPERPARAMETER)