Example #1
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1,2,3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1,10.,(dim,)).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q,self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
                    max_iter = 100,
                    line_search_mode = 'exhaustive',
                    verbose = True,
                    objective = kl,
                    conjugate = True,
                    params = [ p.mu, p.beta, q.mu, q.beta ],
                    param_constrainers = [ p.censor_updates,
                        q.censor_updates ])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:

            if config.floatX == 'float32':
                neg_tol = 4.8e-7
            else:
                neg_tol = 0.

            if kl < - neg_tol:
                raise AssertionError("KL divergence should "
                    "be non-negative but is "+
                    str(kl))

            warnings.warn("KL divergence is not very numerically stable, evidently")

        tol = 5.4e-5
        if kl > tol:
            print 'kl:',kl
            print 'tol:',tol
        assert kl <= tol
        assert not (kl > tol)
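For reference, the objective this test drives to zero has a closed form: the KL divergence between two diagonal Gaussians. The helper below is a plain NumPy sketch of that formula (illustrative, not part of pylearn2), assuming beta denotes per-dimension precision (1/variance) as in the model parameters above; it evaluates to exactly zero when q and p coincide.

import numpy as np

def diag_gaussian_kl(mu_q, beta_q, mu_p, beta_p):
    # KL(q || p) for diagonal Gaussians parameterized by mean and
    # precision (beta = 1 / variance). Illustrative helper only.
    var_q = 1. / beta_q
    var_p = 1. / beta_p
    return 0.5 * np.sum(np.log(var_p / var_q)
                        + (var_q + (mu_q - mu_p) ** 2) / var_p
                        - 1.)

mu = np.zeros(3)
beta = 2. * np.ones(3)
print(diag_gaussian_kl(mu, beta, mu, beta))  # 0.0 when q == p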
Example #2
    def createGradientFunctions(self):
        # Create symbolic inputs and shared parameters
        X = T.dmatrices("X")
        mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f",
                                           "R")
        mu = sharedX(np.random.normal(10, 10, (self.dimTheta, 1)), name='mu')
        logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)),
                           name='logSigma')
        logLambd = sharedX(np.matrix(np.random.uniform(0, 10)),
                           name='logLambd')
        logLambd = T.patternbroadcast(T.dmatrix("logLambd"), [1, 1])
        negKL = 0.5 * T.sum(1 + 2 * logSigma - mu**2 - T.exp(logSigma)**2)
        theta = mu + T.exp(logSigma) * v
        W = theta
        y = X[:, 0]
        X_sim = X[:, 1:]
        f = (T.dot(X_sim, W) + u).flatten()

        gradvariables = [mu, logSigma, logLambd]

        logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 *
                        ((y - f) / (T.exp(logLambd)))**2)

        logp = (negKL + logLike) / self.m

        optimizer = -logp

        self.negKL = th.function([mu, logSigma],
                                 negKL,
                                 on_unused_input='ignore')
        self.f = th.function(gradvariables + [X, u, v],
                             f,
                             on_unused_input='ignore')
        self.logLike = th.function(gradvariables + [X, u, v],
                                   logLike,
                                   on_unused_input='ignore')
        derivatives = T.grad(logp, gradvariables)
        derivatives.append(logp)

        self.gradientfunction = th.function(gradvariables + [X, u, v],
                                            derivatives,
                                            on_unused_input='ignore')
        self.lowerboundfunction = th.function(gradvariables + [X, u, v],
                                              logp,
                                              on_unused_input='ignore')

        self.optimizer = BatchGradientDescent(objective=optimizer,
                                              params=gradvariables,
                                              inputs=[X, u, v],
                                              conjugate=True,
                                              max_iter=1)
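The negKL term above is the usual closed-form negative KL divergence between the variational posterior N(mu, sigma^2) and a standard normal prior, with sigma = exp(logSigma). A small NumPy sanity check of that identity (illustrative, not part of the original code):

import numpy as np

mu = np.array([0.3, -1.2])
log_sigma = np.array([0.1, -0.4])
sigma = np.exp(log_sigma)

# Closed-form KL( N(mu, sigma^2) || N(0, 1) ), summed over dimensions
kl = 0.5 * np.sum(sigma ** 2 + mu ** 2 - 1. - 2. * log_sigma)

# The term used in the example above, which should equal -kl
neg_kl = 0.5 * np.sum(1. + 2. * log_sigma - mu ** 2 - sigma ** 2)

assert np.allclose(neg_kl, -kl)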
Example #3
    def test_zero_optimal(self):
        """ minimizes the kl divergence between q and p
            using batch gradient descent and checks that
            the result is zero"""

        rng = np.random.RandomState([1, 2, 3])

        dim = self.dim

        num_trials = 3

        mu = rng.randn(dim).astype(floatX)
        beta = rng.uniform(.1, 10., (dim, )).astype(floatX)
        self.p.mu.set_value(mu)
        mu = rng.randn(dim).astype(floatX)
        self.q.mu.set_value(mu)
        self.p.beta.set_value(beta)
        beta = rng.uniform(.1, 10., (dim, )).astype(floatX)
        self.q.beta.set_value(beta)

        kl = kl_divergence(self.q, self.p)

        p = self.p
        q = self.q

        optimizer = BatchGradientDescent(
            objective=kl,
            params=[p.mu, p.beta, q.mu, q.beta],
            param_constrainers=[p.censor_updates, q.censor_updates])

        #optimizer.verbose = True

        kl = optimizer.minimize()

        if kl < 0.:
            raise AssertionError("KL divergence should "
                                 "be non-negative but is " + str(kl))

        tol = 5.4e-5
        assert kl <= tol
        assert not (kl > tol)
Example #4
    def fit(self, params=None, l1=.0, l2=.0):
        NLL = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta,
                                 self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param**2)

        regularized_NLL = NLL + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_NLL,
                                         params=params,
                                         inputs=[],
                                         verbose=1)

        minimizer.minimize()
Example #5
    def fit(self, params=None, l1=.0, l2=.0):
        """
        Fit the model by minimizing the Leave One Out (LOO) loss using gradient-based optimization.
        """
        loo_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                      self.eta, self.eps)

        if params is None:
            params = [self.eta]

        # Symbolic Theano variables that represent the L1 and L2 regularization terms
        L1, L2 = .0, .0
        for param in params:
            L1 += T.sum(abs(param))
            L2 += T.sum(param**2)

        regularized_loo_loss = loo_loss + l1 * L1 + l2 * L2

        minimizer = BatchGradientDescent(objective=regularized_loo_loss,
                                         params=params,
                                         inputs=[],
                                         verbose=1)

        minimizer.minimize()
Example #6
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely \
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None
                    and self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               obj_prereqs=obj_prereqs,
                               cost_monitoring_args=fixed_var_descr.fixed_vars)

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=None,
                val=self.optimizer.ave_step_size,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=None,
                val=self.optimizer.ave_grad_size,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=None,
                val=self.optimizer.ave_grad_mult,
                data_specs=(NullSpace(), ''),
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True
Example #7
                        max_beta=beta), -1)
        J = nce(model, X, T.concatenate(Y, axis=0))

        accs = []
        for Y_i in Y:
            pos_prob = 1. / (
                1. + T.exp(model.free_energy(X) - model.free_energy(Y_i)))
            acc = (pos_prob > .5).mean()
            accs.append(acc)
        acc = sum(accs) / float(len(accs))

        print '\tinit accuracy ', function([], acc)()

        #Minimize the objective function with batch gradient descent
        minimizer = BatchGradientDescent(
            objective=J,
            params=model.get_params(),
            param_constrainers=[model.censor_updates])

        print '\tinit obj:', minimizer.obj()
        #minimizer.verbose = True
        minimizer.minimize()
        print '\tfinal obj:', minimizer.obj()

        recovered_beta = model.beta.get_value()
        recovered_mu = model.mu.get_value()

        print '\trecovered beta:', recovered_beta
        print '\trecovered mu:', recovered_mu

        kl = kl_divergence(true, model)
        kl = function([], kl)()
Example #8
def test_batch_gradient_descent():
    """ Verify that batch gradient descent works by checking that
        it minimizes a quadratic function f(x) = x^T A x + b^T x + c
        correctly for several sampled values of A, b, and c.
        The ground truth minimizer is x = np.linalg.solve(A,-b)"""

    n = 3

    A = T.matrix(name='A')
    b = T.vector(name='b')
    c = T.scalar(name='c')

    x = sharedX(np.zeros((n, )), name='x')

    half = np.cast[config.floatX](0.5)

    obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c

    minimizer = BatchGradientDescent(objective=obj,
                                     params=[x],
                                     inputs=[A, b, c])

    num_samples = 3

    rng = np.random.RandomState([1, 2, 3])

    for i in xrange(num_samples):
        A = np.cast[config.floatX](rng.randn(int(1.5 * n), n))
        A = np.cast[config.floatX](np.dot(A.T, A))
        A += np.cast[config.floatX](np.identity(n) * .02)
        b = np.cast[config.floatX](rng.randn(n))
        c = np.cast[config.floatX](rng.randn())
        x.set_value(np.cast[config.floatX](rng.randn(n)))

        analytical_x = np.linalg.solve(A, -b)

        actual_obj = minimizer.minimize(A, b, c)
        actual_x = x.get_value()

        #Check that the value returned by the minimize method
        #is the objective function value at the parameters
        #chosen by the minimize method
        cur_obj = minimizer.obj(A, b, c)
        assert np.allclose(actual_obj, cur_obj)

        x.set_value(analytical_x)
        analytical_obj = minimizer.obj(A, b, c)

        #make sure the objective function is accurate to first 4 digits
        condition1 = not np.allclose(analytical_obj, actual_obj)
        condition2 = np.abs(analytical_obj -
                            actual_obj) >= 1e-4 * np.abs(analytical_obj)

        if (config.floatX == 'float64' and condition1) \
                or (config.floatX == 'float32' and condition2):
            print 'objective function value came out wrong on sample ', i
            print 'analytical obj', analytical_obj
            print 'actual obj', actual_obj
            """
                The following section of code was used to verify that numerical
                error can make the objective function look non-convex

                print 'Checking for numerically induced non-convex behavior'
                def f(x):
                    return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c

                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()

                x = actual_x.copy()
                prev = f(x)
                print prev
                step_size = 1e-4
                x += step_size * d
                cur = f(x)
                print cur
                cur_sgn = np.sign(cur-prev)
                flip_cnt = 0
                for i in xrange(10000):
                    x += step_size * d
                    prev = cur
                    cur = f(x)
                    print cur
                    prev_sgn = cur_sgn
                    cur_sgn = np.sign(cur-prev)
                    if cur_sgn != prev_sgn:
                        print 'flip'
                        flip_cnt += 1
                        if flip_cnt > 1:
                            print "Non-convex!"

                            from matplotlib import pyplot as plt
                            y = []

                            x = actual_x.copy()
                            for j in xrange(10000):
                                y.append(f(x))
                                x += step_size * d

                            plt.plot(y)
                            plt.show()

                            assert False

                print 'None found'
                """

            #print 'actual x',actual_x
            #print 'A:'
            #print A
            #print 'b:'
            #print b
            #print 'c:'
            #print c
            x.set_value(actual_x)
            minimizer._compute_grad(A, b, c)
            x_grad = minimizer.param_to_grad_shared[x]
            actual_grad = x_grad.get_value()
            correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot(
                A.T, x.get_value()) + b
            if not np.allclose(actual_grad, correct_grad):
                print 'gradient was wrong at convergence point'
                print 'actual grad: '
                print actual_grad
                print 'correct grad: '
                print correct_grad
                print 'max difference: ', np.abs(actual_grad -
                                                 correct_grad).max()
                assert False

            minimizer._normalize_grad()
            d = minimizer.param_to_grad_shared[x].get_value()
            step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \
                    + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d))

            g = np.dot(A, actual_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at actual', deriv
            print 'optimal step_len', step_len
            optimal_x = actual_x - d * step_len
            g = np.dot(A, optimal_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at optimal: ', deriv
            x.set_value(optimal_x)
            print 'obj at optimal: ', minimizer.obj(A, b, c)

            print 'eigenvalue range:'
            val, vec = np.linalg.eig(A)
            print(val.min(), val.max())
            print 'condition number: ', (val.max() / val.min())
            assert False
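A quick NumPy check of the ground truth this test relies on (illustrative, separate from the test itself): for the quadratic 0.5 * x^T A x + b^T x + c with A symmetric positive definite, the gradient A x + b vanishes at x = np.linalg.solve(A, -b), so that point is the unique minimizer.

import numpy as np

rng = np.random.RandomState(0)
n = 3
A = rng.randn(2 * n, n)
A = np.dot(A.T, A) + 0.02 * np.identity(n)  # symmetric positive definite
b = rng.randn(n)
c = rng.randn()

def f(x):
    return 0.5 * np.dot(x, np.dot(A, x)) + np.dot(b, x) + c

x_star = np.linalg.solve(A, -b)
assert np.allclose(np.dot(A, x_star) + b, 0.)        # gradient is zero at x_star
assert f(x_star) <= f(x_star + 1e-3 * rng.randn(n))  # nearby points are no better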
Example #9
p, h = state

p_shape = layer.get_output_space().shape
i = p_shape[0] / 2
j = p_shape[1] / 2

act = p[0,filter_idx,i,j]

obj = - act + norm_penalty * T.square(X).sum()

assert obj.ndim == 0

optimizer = BatchGradientDescent(objective = obj,
        params = [X],
        inputs = None,
        param_constrainers = None,
        max_iter = 1000,
        verbose = True,
        tol = None,
        init_alpha = (.001, .005, .01, .05, .1))

optimizer.minimize()

img = X.get_value()[0,:,:,:]

print 'max mag: ',np.abs(img).max()
print 'norm: ',np.square(img).sum()
print 'min: ',img.min()
print 'max: ',img.max()

img /= np.abs(img).max()
Example #10
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        self.monitor = Monitor.get_monitor(model)
        X = T.matrix()
        Y = T.matrix()
        dnce = DNCE( self.noise)
        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None
            self.monitor.set_dataset(dataset=self.monitoring_dataset,
                                mode="sequential",
                                batch_size=self.batch_size,
                                num_batches=self.monitoring_batches)
            X.tag.test_value = self.monitoring_dataset.get_batch_design(2)
            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            dnce.noise_per_clean = self.noise_per_clean
            obj = dnce(model,X)
            dnce.noise_per_clean = None
            self.monitor.add_channel('DNCE',ipt=X,val=obj)

            for name in channels:
                J = channels[name]
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X,Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)

        X = sharedX( dataset.get_batch_design(1), 'X')
        Y = []
        updates = {}
        for i in xrange(self.noise_per_clean):
            Y_i = sharedX( X.get_value().copy() )
            updates[Y_i] = self.noise.random_design_matrix(X)
            Y.append(Y_i)
        self.update_noise = function([], updates = updates)


        obj = dnce(model,X,Y)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            max_iter = 5)
        self.X = X
        self.Y = Y


        self.first = True
        self.bSetup = True
Example #11
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or batch_size ==
                        model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                            (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        self.topo = X.ndim != 2
        if self.topo:
            assert self.model.get_input_space().axes == ('b', 0, 1, 'c')
        Y = T.matrix()
        Y.name = 'BGD_Y'
        if config.compute_test_value != 'off':
            X.tag.test_value = self.model.get_input_space().get_origin_batch(self.batch_size).astype(X.dtype)
            Y_batch = self.model.get_output_space().get_origin_batch(self.batch_size).astype(Y.dtype)
            assert Y_batch.ndim == 2
            for i in xrange(Y_batch.shape[0]):
                Y_batch[i, i % Y_batch.shape[1]] = 1
            Y.tag.test_value = Y_batch

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)


        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        # TODO: replace the following if block with a call to monitor.setup (it does the same thing;
        # this will reduce code duplication)
        # may need to still manually add some BGD-specific channels like ave_step_size here
        if self.monitoring_dataset is not None:
            if not any([dataset.has_targets() for dataset in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size, dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult, dataset=self.monitoring_dataset.values()[0])


        self.first = True
        self.bSetup = True
Example #12
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        #We want to use big batches. We need to make several theano calls on each
        #batch. To avoid paying the GPU latency every time, we use a shared variable
        #but the shared variable needs to stay allocated during the time that the
        #monitor is working, and we don't want the monitor to increase the memory
        #overhead. So we make the monitor work off of the same shared variable
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'

                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                #we only need to put the prereq in once to make sure it gets run
                #adding it more times shouldn't hurt, but be careful
                #each time you say "self.setup_batch" you get a new object with a
                #different id, and if you install n of those the prereq will run n
                #times. It won't cause any wrong results, just a big slowdown
                warnings.warn(
                    "This is weird -- ipt=(X, Y) tells the monitor to replace "
                    "X, Y with the givens dict, but you don't actually want "
                    "them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True
Example #13
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" %
                        (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X
        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                "it is intractable, but BGD uses the cost function value "
                "to do line searches.")

        if self.monitoring_dataset is not None:
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode)

        self.first = True
        self.bSetup = True
Example #14
outputs = model.fprop(normed, return_all=True)

output = outputs[layer_idx]
neuron = output[tuple(idxs)]

from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

bgd = BatchGradientDescent(objective=-neuron,
        params=[X],
        inputs=None,
        max_iter=100,
        lr_scalers=None,
        verbose=3,
        tol=None,
        init_alpha=None,
        min_init_alpha=1e-3,
        reset_alpha=True,
        conjugate=True,
        gradients=None,
        gradient_updates=None,
        accumulate=False,
        theano_function_mode=None,
        param_constrainers=None)

bgd.minimize()


X = normed.eval()[:,:,:,0].transpose(1,2,0)
import numpy as np
X /= np.abs(X).max()
print (X.min(), X.max())
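Distilling the pattern common to the examples above, a minimal call to pylearn2's BatchGradientDescent looks like the sketch below. This is an assumption-laden illustration (it presumes Theano and pylearn2 are importable, and the quadratic objective and shared variable w are made up for the example), not code from any of the snippets.

import numpy as np
import theano.tensor as T
from pylearn2.utils import sharedX
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

# A shared parameter to optimize and a simple differentiable objective
w = sharedX(np.random.randn(5), name='w')
objective = T.sqr(w - 3.).sum()

optimizer = BatchGradientDescent(objective=objective,
                                 params=[w],
                                 inputs=None,
                                 conjugate=True,
                                 max_iter=100)

final_obj = optimizer.minimize()  # returns the objective value after optimization
print(w.get_value())              # each coordinate should end up near 3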