Example #1
def test_vector_to_conv_c01b_invertible():
    """
    Tests that the format_as methods between Conv2DSpace
    and VectorSpace are invertible for the ('c', 0, 1, 'b')
    axis format.
    """

    rng = np.random.RandomState([2013, 5, 1])

    batch_size = 3
    rows = 4
    cols = 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels=channels, axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    X = conv.make_batch_theano()
    Y = conv.format_as(X, vec)
    Z = vec.format_as(Y, conv)

    A = vec.make_batch_theano()
    B = vec.format_as(A, conv)
    C = conv.format_as(B, vec)

    f = function([X, A], [Z, C])

    X = rng.randn(*(conv.get_origin_batch(batch_size).shape)).astype(X.dtype)
    A = rng.randn(*(vec.get_origin_batch(batch_size).shape)).astype(A.dtype)

    Z, C = f(X, A)

    np.testing.assert_allclose(Z, X)
    np.testing.assert_allclose(C, A)
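
The round trip above can be mimicked in plain NumPy. A minimal sketch, assuming the conventional pylearn2 layout in which a topological batch is transposed to ('b', 0, 1, 'c') before being flattened to a design matrix; the helper names here are illustrative, not part of pylearn2's API:

import numpy as np

def c01b_to_vector(topo):
    # ('c', 0, 1, 'b') -> ('b', 0, 1, 'c') -> (b, rows * cols * channels)
    b01c = topo.transpose(3, 1, 2, 0)
    return b01c.reshape(b01c.shape[0], -1)

def vector_to_c01b(design, rows, cols, channels):
    # invert the flattening, then restore the ('c', 0, 1, 'b') axis order
    b01c = design.reshape(design.shape[0], rows, cols, channels)
    return b01c.transpose(3, 1, 2, 0)

topo = np.random.randn(2, 4, 5, 3)  # (channels, rows, cols, batch)
roundtrip = vector_to_c01b(c01b_to_vector(topo), rows=4, cols=5, channels=2)
np.testing.assert_allclose(roundtrip, topo)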
Example #2
    def inv_prop(self, state_above):
        if not isinstance(state_above, tuple):
            expected_space = VectorSpace(self.output_space.get_total_dimension())
            state_above = expected_space.format_as(state_above, self.output_space)

        self.output_space.validate(state_above)
        return tuple(layer.inv_prop(state)
                     for layer, state in safe_zip(self.layers, state_above))
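
safe_zip here is pylearn2's length-checked variant of zip, so a state tuple with the wrong number of components fails loudly instead of being silently truncated. A rough sketch of the idea (not pylearn2's exact implementation):

def safe_zip(*sequences):
    # like zip, but refuse to silently drop trailing elements
    length = len(sequences[0])
    for seq in sequences:
        assert len(seq) == length, "sequence lengths differ"
    return zip(*sequences)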
Example #3
def simulate(inputs, model):
    space = VectorSpace(inputs.shape[1])
    X = space.get_theano_batch()
    Y = model.fprop(space.format_as(X, model.get_input_space()))
    f = theano.function([X], Y)
    result = []
    for x in xrange(0, len(inputs), 100):
        result.extend(f(inputs[x:x + 100]))
    return result
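
Note that the compiled function f is built once and then fed slices of 100 rows, which bounds the memory of any single Theano call while still covering every input. The chunk size of 100 matches the loop above and is otherwise arbitrary; the same pattern in isolation:

def apply_in_chunks(f, inputs, chunk_size=100):
    # evaluate f on successive slices so no call sees the whole array
    result = []
    for start in range(0, len(inputs), chunk_size):
        result.extend(f(inputs[start:start + chunk_size]))
    return result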
Example #4
class VectorSpaceConverter(mlp.Layer):
    def __init__(self, layer_name):
        self.layer_name = layer_name
        self._params = []

    def set_input_space(self, space):
        self.input_space = space
        self.output_space = VectorSpace(space.get_total_dimension())

    def fprop(self, state_below):
        return self.input_space.format_as(state_below, self.output_space)

    def inv_prop(self, state_above):
        return self.output_space.format_as(state_above, self.input_space)

    def get_weight_decay(self, coeff):
        return 0.0

    def get_l1_weight_decay(self, coeff):
        return 0.0
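
The converter is a parameter-free reshaping layer: fprop flattens whatever topology the layer below produces into a design matrix, and inv_prop undoes it. The NumPy equivalent of one fprop/inv_prop cycle (shapes are illustrative):

import numpy as np

batch = np.random.randn(3, 4, 5, 2)       # e.g. a ('b', 0, 1, 'c') topo batch
flat = batch.reshape(batch.shape[0], -1)  # fprop: (3, 40) design matrix
restored = flat.reshape(batch.shape)      # inv_prop: back to the topology
assert np.allclose(restored, batch)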
Example #5
class Softmax(HiddenLayer):
    def __init__(self,
                 n_classes,
                 layer_name,
                 irange=None,
                 sparse_init=None,
                 W_lr_scale=None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes, )), name='softmax_b')

    def get_lr_scalers(self):

        rval = {}

        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        return rval

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):

        mx = state.max(axis=1)

        return {
            'mean_max_class': mx.mean(),
            'max_max_class': mx.max(),
            'min_max_class': mx.min()
        }

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) + " of type " +
                            str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self,
               state_below=None,
               state_above=None,
               layer_above=None,
               theano_rng=None):

        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError(
                "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list."
            )

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        return h_sample

    def mf_update(self,
                  state_below,
                  state_above=None,
                  layer_above=None,
                  double_weights=False,
                  iter_name=None):
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)
        """
        from pylearn2.utils import serial
        X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl')
        state_below = Verify(X,'features')(state_below)
        """

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        #Z = Print('Z')(Z)

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):

        rval = T.dot(downward_state, self.W.T)

        rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
            scale is because the visible layer also goes into the
            cost. it uses the mean over units and examples, so that
            the scale of the cost doesn't change too much with batch
            size or example size.
            we need to multiply this cost by scale to make sure that
            it is put on the same scale as the reconstruction cost
            for the visible units. ie, scale should be 1/nvis
        """

        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return -rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        h_state.name = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        return coeff * T.sqr(self.W).sum()

    def expected_energy_term(self, state, average, state_below, average_below):

        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        self.desired_space.validate(state_below)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval
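
The recons_cost method above computes a numerically stabilized log-softmax before masking. Writing z for the pre-softmax activations of one example, the identity being used is

\[
\log \operatorname{softmax}(z)_i = z_i - \log \sum_j e^{z_j}
  = \big(z_i - \max_k z_k\big) - \log \sum_j e^{z_j - \max_k z_k},
\]

where subtracting the row maximum first keeps the exponentials from overflowing. The log around the sum is essential: without it the expression is no longer a log probability.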
Example #6
class ConditionalGenerator(Generator):
    def __init__(self, mlp, input_condition_space, condition_distribution, noise_dim=100, *args, **kwargs):
        super(ConditionalGenerator, self).__init__(mlp, *args, **kwargs)

        self.noise_dim = noise_dim
        self.noise_space = VectorSpace(dim=self.noise_dim)

        self.condition_space = input_condition_space
        self.condition_distribution = condition_distribution

        self.input_space = CompositeSpace([self.noise_space, self.condition_space])
        self.mlp.set_input_space(self.input_space)

    def sample_and_noise(
        self, conditional_data, default_input_include_prob=1.0, default_input_scale=1.0, all_g_layers=False
    ):
        """
        Retrieve a sample (and the noise used to generate the sample)
        conditioned on some input data.

        Parameters
        ----------
        conditional_data: member of self.condition_space
            A minibatch of conditional data to feedforward.

        default_input_include_prob: float
            WRITEME

        default_input_scale: float
            WRITEME

        all_g_layers: boolean
            If true, return all generator layers in `other_layers` slot
            of this method's return value. (Otherwise returns `None` in
            this slot.)

        Returns
        -------
        net_output: 3-tuple
            Tuple of the form `(sample, noise, other_layers)`.
        """

        if isinstance(conditional_data, int):
            conditional_data = self.condition_distribution.sample(conditional_data)

        num_samples = conditional_data.shape[0]

        noise = self.get_noise((num_samples, self.noise_dim))
        # TODO necessary?
        formatted_noise = self.noise_space.format_as(noise, self.noise_space)

        # Build inputs: concatenate noise with conditional data
        inputs = (formatted_noise, conditional_data)

        # Feedforward
        # if all_g_layers:
        #     rval = self.mlp.dropout_fprop(inputs, default_input_include_prob=default_input_include_prob,
        #                                   default_input_scale=default_input_scale, return_all=all_g_layers)
        #     other_layers, rval = rval[:-1], rval[-1]
        # else:
        rval = self.mlp.dropout_fprop(
            inputs, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale
        )
        # other_layers = None

        return rval, formatted_noise, conditional_data, None  # , other_layers

    def sample(self, conditional_data, **kwargs):
        sample, _, _, _ = self.sample_and_noise(conditional_data, **kwargs)
        return sample

    def get_monitoring_channels(self, data):
        if data is None:
            m = 100
            conditional_data = self.condition_distribution.sample(m)
        else:
            _, conditional_data = data
            m = conditional_data.shape[0]

        noise = self.get_noise((m, self.noise_dim))
        rval = OrderedDict()

        sampled_data = (noise, conditional_data)
        try:
            rval.update(self.mlp.get_monitoring_channels((sampled_data, None)))
        except Exception:
            warnings.warn("something went wrong with generator.mlp's monitoring channels")

        if self.monitor_ll:
            rval["ll"] = T.cast(self.ll(data, self.ll_n_samples, self.ll_sigma), theano.config.floatX).mean()
            rval["nll"] = -rval["ll"]
        return rval

    def ll(self, data, n_samples, sigma):
        real_data, conditional_data = data
        sampled_data = self.sample(conditional_data)

        output_space = self.mlp.get_output_space()
        if "Conv2D" in str(output_space):
            samples = output_space.convert(sampled_data, output_space.axes, ("b", 0, 1, "c"))
            samples = samples.flatten(2)
            data = output_space.convert(real_data, output_space.axes, ("b", 0, 1, "c"))
            data = data.flatten(2)
        parzen = theano_parzen(data, samples, sigma)
        return parzen
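
theano_parzen scores the real minibatch under a Parzen window (kernel density) estimate built from generator samples, the usual GAN evaluation heuristic. With samples s_1, ..., s_n and an isotropic Gaussian kernel of width sigma, the estimated log-likelihood of a point x is

\[
\log \hat{p}(x) = \log \frac{1}{n} \sum_{j=1}^{n}
  \mathcal{N}\!\big(x;\, s_j,\, \sigma^2 I\big),
\]

typically computed with a log-sum-exp for stability; the 'll' monitoring channel then averages this over the real examples.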
Example #7
class Softmax(Layer):

    def __init__(self, n_classes, layer_name, irange=None, istdev=None,
                 sparse_init=None, W_lr_scale=None,
                 b_lr_scale=None, max_row_norm=None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale

        return rval

    def get_monitoring_channels_from_state(self, state, target=None):

        mx = state.max(axis=1)

        rval = OrderedDict([
            ('mean_max_class', mx.mean()),
            ('max_max_class', mx.max()),
            ('min_max_class', mx.min())
        ])

        if target is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(target, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass

        return rval

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):

        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        self.desired_space.validate(state_below)

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        for value in get_debug_values(rval):
            assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a softmax estimate.
        of Y. Returns negative log probability of Y under the Y_hat
        distribution.
        """

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        rval = log_prob_of.mean()

        return -rval

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def censor_updates(self, updates):
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + row_norms)).dimshuffle(0, 'x')
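
censor_updates enforces the max-norm constraint: after each update, any row of W whose L2 norm exceeds max_row_norm is rescaled back onto the constraint ball. A NumPy sketch of the same post-processing step (the Theano version above operates on the symbolic update instead):

import numpy as np

def clip_row_norms(W, max_row_norm):
    # rescale rows whose L2 norm exceeds the limit; leave the rest alone
    row_norms = np.sqrt((W ** 2).sum(axis=1))
    desired_norms = np.clip(row_norms, 0, max_row_norm)
    return W * (desired_norms / (1e-7 + row_norms))[:, np.newaxis]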
Example #8
class MultiSoftmax(Layer):

    def __init__(self, n_groups, n_classes, layer_name, irange=None,
                 istdev=None, sparse_init=None, W_lr_scale=None,
                 b_lr_scale=None, max_row_norm=None,
                 no_affine=False, max_col_norm=None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, py_integer_types)

        self.output_space = MatrixSpace(n_groups, n_classes)
        self.b = sharedX(np.zeros((n_groups, n_classes)), name='softmax_b')

    def get_lr_scalers(self):

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale

        return rval

    def get_monitoring_channels(self):
        return OrderedDict()

    def get_monitoring_channels_from_state(self, state, target=None):
        return OrderedDict()
        
    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_groups, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_groups,
                          self.n_classes) * self.istdev
        else:
            raise NotImplementedError()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):

        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if (self.mlp.batch_size is not None and
                    value.shape[0] != self.mlp.batch_size):
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        assert self.W.ndim == 3

        Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b

        rval = batched_softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        return self.cost_from_cost_matrix(self.cost_matrix(Y, Y_hat))

    def cost_from_cost_matrix(self, cost_matrix):
        return cost_matrix.sum(axis=2).mean()

    def cost_matrix(self, Y, Y_hat):
        return -Y * T.log(Y_hat)

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()

    def censor_updates(self, updates):
        return
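        # NOTE: the early return above makes the constraint code below
        # unreachable; the norm clipping is effectively disabled.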
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + row_norms)).dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
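
With a 3-D weight tensor, fprop maps a (batch, input_dim) matrix through W of shape (input_dim, n_groups, n_classes) to logits of shape (batch, n_groups, n_classes), and batched_softmax then normalizes over the class axis independently for each group (as the per-group cross-entropy in cost_matrix assumes). The shape arithmetic, checked in NumPy:

import numpy as np

batch, input_dim, n_groups, n_classes = 3, 8, 4, 5
X = np.random.randn(batch, input_dim)
W = np.random.randn(input_dim, n_groups, n_classes)
b = np.zeros((n_groups, n_classes))

Z = np.tensordot(X, W, axes=[[1], [0]]) + b    # (batch, n_groups, n_classes)
E = np.exp(Z - Z.max(axis=-1, keepdims=True))  # stable per-group softmax
P = E / E.sum(axis=-1, keepdims=True)
assert P.shape == (batch, n_groups, n_classes)
assert np.allclose(P.sum(axis=-1), 1.0)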
Example #9
class BoltzmannIsingHidden(HiddenLayer):
    """

    A hidden layer with h being a vector in {-1, 1}^dim,
    implementing the energy function term

    -v^T Wh -b^T h

    where W and b are parameters of this layer, and v is
    the upward state of the layer below

    """

    def __init__(self, dim, layer_name, layer_below,
                 irange=None, sparse_init=None, sparse_stdev=1.,
                 include_prob=1.0, init_bias=0.,
                 W_lr_scale=None, b_lr_scale=None, max_col_norm=None,
                 min_ising_b=None, max_ising_b=None,
                 min_ising_W=None, max_ising_W=None,
                 sampling_W_stdev=None, sampling_b_stdev=None):
        """

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        """
        self.__dict__.update(locals())
        del self.self

        self.boltzmann_b = sharedX(np.zeros((self.dim,)) + init_bias,
                                   name=layer_name + '_b')
        layer_below.layer_above = self

    def get_lr_scalers(self):

        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            W = self.W
            rval[W] = self.W_lr_scale

        if self.b_lr_scale is not None:
            rval[self.boltzmann_b] = self.b_lr_scale

        return rval

    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        self.output_space = VectorSpace(self.dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                            self.irange,
                            (self.input_dim, self.dim)) * \
                (rng.uniform(0., 1., (self.input_dim, self.dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.dim))
            for i in xrange(self.dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()
            W *= self.sparse_stdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.W = W

        if self.sampling_b_stdev is not None:
            self.noisy_sampling_b = sharedX(np.zeros((self.dbm.batch_size, self.dim)))
            self.layer_below.noisy_sampling_b = sharedX(np.zeros((self.dbm.batch_size, self.layer_below.nvis)))
        if self.sampling_W_stdev is not None:
            self.noisy_sampling_W = sharedX(np.zeros((self.input_dim, self.dim)), 'noisy_sampling_W')

        updates = OrderedDict()
        updates[self.boltzmann_b] = self.boltzmann_b
        updates[self.W] = self.W
        updates[self.layer_below.boltzmann_bias] = self.layer_below.boltzmann_bias
        self.censor_updates(updates)
        f = function([], updates=updates)
        f()

    def censor_updates(self, updates):

        if self.max_col_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        if any(constraint is not None for constraint in [self.min_ising_b, self.max_ising_b,
            self.min_ising_W, self.max_ising_W]):
            assert not hasattr(self.layer_below, 'layer_below')
            bmn = self.min_ising_b
            if bmn is None:
                bmn = - 1e6
            bmx = self.max_ising_b
            if bmx is None:
                bmx = 1e6
            wmn = self.min_ising_W
            if wmn is None:
                wmn = - 1e6
            wmx = self.max_ising_W
            if wmx is None:
                wmx = 1e6

            W = updates[self.W]
            ising_W = 0.25 * W
            ising_W = T.clip(ising_W, wmn, wmx)

            bv = updates[self.layer_below.boltzmann_bias]
            ising_bv = 0.5 * bv + 0.25 * W.sum(axis=1)
            ising_bv = T.clip(ising_bv, bmn, bmx)

            bh = updates[self.boltzmann_b]
            ising_bh = 0.5 * bh + 0.25 * W.sum(axis=0)
            ising_bh = T.clip(ising_bh, bmn, bmx)


            Wn = 4. * ising_W
            bvn = 2. * (ising_bv - ising_W.sum(axis=1))
            bhn = 2. * (ising_bh - ising_W.sum(axis=0))

            updates[self.W] = Wn
            updates[self.layer_below.boltzmann_bias] = bvn
            updates[self.boltzmann_b] = bhn

        if self.sampling_W_stdev is not None:
            theano_rng = MRG_RandomStreams(self.dbm.rng.randint(2**16))
            bmn = self.min_ising_b
            if bmn is None:
                bmn = - 1e6
            bmx = self.max_ising_b
            if bmx is None:
                bmx = 1e6
            wmn = self.min_ising_W
            if wmn is None:
                wmn = - 1e6
            wmx = self.max_ising_W
            if wmx is None:
                wmx = 1e6
            W = updates[self.W]
            ising_W = 0.25 * W
            noisy_sampling_W = theano_rng.normal(avg=ising_W, std=self.sampling_W_stdev, size=ising_W.shape, dtype=ising_W.dtype)
            updates[self.noisy_sampling_W] = noisy_sampling_W

            bv = updates[self.layer_below.boltzmann_bias]
            ising_bv = 0.5 * bv + 0.25 * W.sum(axis=1)
            noisy_sampling_bv = theano_rng.normal(avg=ising_bv.dimshuffle('x', 0), std=self.sampling_b_stdev,
                    size=self.layer_below.noisy_sampling_b.shape, dtype=ising_bv.dtype)
            updates[self.layer_below.noisy_sampling_b] = noisy_sampling_bv

            bh = updates[self.boltzmann_b]
            ising_bh = 0.5 * bh + 0.25 * W.sum(axis=0)
            noisy_sampling_bh = theano_rng.normal(avg=ising_bh.dimshuffle('x', 0), std=self.sampling_b_stdev, size = self.noisy_sampling_b.shape, dtype=ising_bh.dtype)
            updates[self.noisy_sampling_b] = noisy_sampling_bh


    def get_total_state_space(self):
        return VectorSpace(self.dim)

    def get_params(self):
        assert self.boltzmann_b.name is not None
        W = self.W
        assert W.name is not None
        rval = [W]
        assert not isinstance(rval, set)
        rval = list(rval)
        assert self.boltzmann_b not in rval
        rval.append(self.boltzmann_b)
        return rval

    def ising_weights(self, for_sampling=False):
        if not hasattr(self, 'sampling_W_stdev'):
            self.sampling_W_stdev = None
        if for_sampling and self.sampling_W_stdev is not None:
            return self.noisy_sampling_W
        return 0.25 * self.W

    def ising_b(self, for_sampling=False):
        if hasattr(self, 'layer_above'):
            raise NotImplementedError()
        if not hasattr(self, 'sampling_b_stdev'):
            self.sampling_b_stdev = None
        if for_sampling and self.sampling_b_stdev is not None:
            return self.noisy_sampling_b
        return 0.5 * self.boltzmann_b + 0.25 * self.W.sum(axis=0)

    def ising_b_numpy(self):
        if hasattr(self, 'layer_above'):
            raise NotImplementedError()
        return 0.5 * self.boltzmann_b.get_value() + 0.25 * self.W.get_value().sum(axis=0)

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        warnings.warn("BoltzmannIsingHidden.get_weights returns the BOLTZMANN weights, is that what we want?")
        W = self.W
        return W.get_value()

    def set_weights(self, weights):
        warnings.warn("BoltzmannIsingHidden.set_weights sets the BOLTZMANN weights, is that what we want?")
        W = self.W
        W.set_value(weights)

    def set_biases(self, biases, recenter = False):
        assert False # not really sure what this should do

    def get_biases(self):
        assert False # not really sure what this should do

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_topo(self):
        warnings.warn("BoltzmannIsingHidden.get_weights_topo returns the BOLTZMANN weights, is that what we want?")

        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W = self.W

        W = W.T

        W = W.reshape((self.dim, self.input_space.shape[0],
                       self.input_space.shape[1], self.input_space.nchannels))

        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        return total_state

    def downward_state(self, total_state):
        return total_state

    def get_monitoring_channels(self):

        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        rval =  OrderedDict([
              ('boltzmann_row_norms_min'  , row_norms.min()),
              ('boltzmann_row_norms_mean' , row_norms.mean()),
              ('boltzmann_row_norms_max'  , row_norms.max()),
              ('boltzmann_col_norms_min'  , col_norms.min()),
              ('boltzmann_col_norms_mean' , col_norms.mean()),
              ('boltzmann_col_norms_max'  , col_norms.max()),
            ])

        ising_W = self.ising_weights()

        rval['ising_W_min'] = ising_W.min()
        rval['ising_W_max'] = ising_W.max()

        ising_b = self.ising_b()

        rval['ising_b_min'] = ising_b.min()
        rval['ising_b_max'] = ising_b.max()

        if hasattr(self, 'noisy_sampling_W'):
            rval['noisy_sampling_W_min'] = self.noisy_sampling_W.min()
            rval['noisy_sampling_W_max'] = self.noisy_sampling_W.max()
            rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min()
            rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max()

        return rval

    def get_monitoring_channels_from_state(self, state):

        P = state

        rval = OrderedDict()

        vars_and_prefixes = [(P, '')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            # max_x.mean_u is "the mean over *u*nits of the max over
            # e*x*amples". The x and u are included in the name because
            # otherwise it's hard to remember which axis is which when
            # reading the monitor. I use inner.outer rather than
            # outer_of_inner or something like that because I want
            # mean_x.* to appear next to each other in the alphabetical
            # list, as these are commonly plotted together.
            for key, val in [
                    ('max_x.max_u', v_max.max()),
                    ('max_x.mean_u', v_max.mean()),
                    ('max_x.min_u', v_max.min()),
                    ('min_x.max_u', v_min.max()),
                    ('min_x.mean_u', v_min.mean()),
                    ('min_x.min_u', v_min.min()),
                    ('range_x.max_u', v_range.max()),
                    ('range_x.mean_u', v_range.mean()),
                    ('range_x.min_u', v_range.min()),
                    ('mean_x.max_u', v_mean.max()),
                    ('mean_x.mean_u', v_mean.mean()),
                    ('mean_x.min_u', v_mean.min())
                    ]:
                rval[prefix+key] = val

        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above, for_sampling=True)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        z = T.dot(state_below, self.ising_weights(for_sampling=True)) + self.ising_b(for_sampling=True)

        if msg is not None:
            z = z + msg

        on_prob = T.nnet.sigmoid(2. * z)

        samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape,
                                      dtype=on_prob.dtype) * 2. - 1.

        return samples

    def downward_message(self, downward_state, for_sampling=False):
        rval = T.dot(downward_state,
                     self.ising_weights(for_sampling=for_sampling).T)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def init_mf_state(self):
        raise NotImplementedError("This is just a copy-paste of BVMP")
        # work around theano bug with broadcasted vectors
        z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.boltzmann_b.dtype) + \
                self.ising_b().dimshuffle('x', 0)
        rval = max_pool_channels(z = z,
                pool_size = self.pool_size)
        return rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """
        driver = numpy_rng.uniform(0.,1., (num_examples, self.dim))
        on_prob = sigmoid_numpy(2. * self.ising_b_numpy())
        sample = 2. * (driver < on_prob) - 1.

        rval = sharedX(sample, name = 'v_sample_shared')

        return rval

    def make_symbolic_state(self, num_examples, theano_rng):
        mean = T.nnet.sigmoid(2. * self.ising_b())
        rval = theano_rng.binomial(size=(num_examples, self.dim), p=mean)
        rval = 2. * rval - 1.

        return rval

    def expected_energy_term(self, state, average, state_below, average_below):

        # state = Print('h_state', attrs=['min', 'max'])(state)

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(state, self.ising_b())
        weights_term = (T.dot(state_below, self.ising_weights()) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def linear_feed_forward_approximation(self, state_below):
        """
        Used to implement TorontoSparsity. Unclear exactly what properties of it are
        important or how to implement it for other layers.

        Properties it must have:
            output is same kind of data structure (ie, tuple of theano 2-tensors)
            as mf_update

        Properties it probably should have for other layer types:
            An infinitesimal change in state_below or the parameters should cause the same sign of change
            in the output of linear_feed_forward_approximation and in mf_update

            Should not have any non-linearities that cause the gradient to shrink

            Should disregard top-down feedback
        """

        z = T.dot(state_below, self.ising_weights()) + self.ising_b()

        return z

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']'
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_'+iter_name + '_2state'
        z = T.dot(state_below, self.ising_weights()) + self.ising_b()
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'
        if msg is not None:
            z = z + msg
        h = T.tanh(z)

        return h

    def get_l2_act_cost(self, state, target, coeff):
        avg = state.mean(axis=0)
        diff = avg - target
        return coeff * T.sqr(diff).mean()
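
The 0.25 and 0.5 factors running through ising_weights, ising_b, and censor_updates come from the change of variables between Boltzmann units s in {0, 1} and Ising units h = 2s - 1 in {-1, 1}. Substituting s = (h + 1)/2 into the Boltzmann energy -a^T s_v - b^T s_h - s_v^T W s_h and dropping constants gives the Ising parameters

\[
W' = \tfrac{1}{4} W, \qquad
a' = \tfrac{1}{2} a + \tfrac{1}{4} W \mathbf{1}, \qquad
b' = \tfrac{1}{2} b + \tfrac{1}{4} W^{\top} \mathbf{1},
\]

and inverting this map is exactly the Wn = 4 * ising_W, bvn = 2 * (ising_bv - ising_W.sum(axis=1)), bhn = 2 * (ising_bh - ising_W.sum(axis=0)) step in censor_updates.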
Example #10
class BinaryVectorMaxPool(HiddenLayer):
    """
        A hidden layer that does max-pooling on binary vectors.
        It has two sublayers, the detector layer and the pooling
        layer. The detector layer is its downward state and the pooling
        layer is its upward state.

        TODO: this layer uses (pooled, detector) as its total state,
              which can be confusing when listing all the states in
              the network left to right. Change this and
              pylearn2.expr.probabilistic_max_pooling to use
              (detector, pooled)
    """

    def __init__(self, detector_layer_dim, pool_size, layer_name,
                 irange=None, sparse_init=None, include_prob=1.0,
                 init_bias=0.):
        """

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX(np.zeros((self.detector_layer_dim,)) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)


        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" %
                    (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                                 self.irange,
                                 (self.input_dim, self.detector_layer_dim)) * \
                    (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim))
                     < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights
            # in design space. We got the data in topo space
            # and we don't have access to the dataset
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row
        rows = total / cols
        return rows, cols


    def get_weights_topo(self):

        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W, = self.transformer.get_params()

        W = W.T

        W = W.reshape((self.detector_layer_dim, self.input_space.shape[0],
            self.input_space.shape[1], self.input_space.nchannels))

        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        p,h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p,h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):

        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [ (P,'') ]
        else:
            vars_and_prefixes = [ (P, 'p_'), (H, 'h_') ]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [
                    ('max_max', v_max.max()),
                    ('max_mean', v_max.mean()),
                    ('max_min', v_max.min()),
                    ('min_max', v_min.max()),
                    ('min_mean', v_min.mean()),
                    ('min_min', v_min.min()),
                    ('range_max', v_range.max()),
                    ('range_mean', v_range.mean()),
                    ('range_min', v_range.min()),
                    ('mean_max', v_mean.max()),
                    ('mean_mean', v_mean.mean()),
                    ('mean_min', v_mean.min())
                    ]:
                rval[prefix+key] = val

        return rval


    def get_l1_act_cost(self, state, target, coeff, eps = None):
        rval = 0.

        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)


        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the detector units to be sparser than the pooling units?")

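        # Epsilon-insensitive L1 activation penalty: a unit's mean activation
        # over the batch may deviate from its target by up to eps before the
        # excess deviation enters the cost, scaled by coeff.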
        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m-t)-e,0.).mean()*c

        return rval

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z,
                self.pool_size, msg, theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
                z = default_z,
                pool_size = self.pool_size,
                theano_rng = theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX( self.output_space.get_origin_batch(
            num_examples))

        t2 = time.time()

        f = function([], updates = {
            p_state : p_sample,
            h_state : h_sample
            })

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self)+'.make_state took',t4-t1
        print '\tcompose time:',t2-t1
        print '\tcompile time:',t3-t2
        print '\texecute time:',t4-t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below, average_below):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']'
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_'+iter_name + '_2state'
        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'
        p,h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
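
For reference, a minimal numpy sketch (not the pylearn2 implementation) of the mean-field quantities that max_pool_channels computes for each group of pool_size detector units; the function name is invented here, and the optional top-down msg term is omitted:

import numpy as np

def max_pool_channels_numpy(z, pool_size):
    # z: (batch, detector_dim) pre-activations, detector_dim divisible by
    # pool_size. Each pool competes with an implicit "all off" option of
    # energy 0, so at most one detector unit per pool tends to be on.
    batch, dim = z.shape
    zp = z.reshape(batch, dim // pool_size, pool_size)
    padded = np.concatenate([zp, np.zeros(zp.shape[:2] + (1,))], axis=2)
    padded = padded - padded.max(axis=2, keepdims=True)  # numerical stability
    e = np.exp(padded)
    dist = e / e.sum(axis=2, keepdims=True)
    h = dist[:, :, :-1].reshape(batch, dim)  # P(each detector unit is on)
    p = 1. - dist[:, :, -1]                  # P(some unit in the pool is on)
    return p, h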
Example #15
class IsingHidden(HiddenLayer):
    """

    A hidden layer with h being a vector in {-1, 1}^dim,
    implementing the energy function term

    -v^T W h - b^T h

    where W and b are parameters of this layer, and v is
    the upward state of the layer below

    """

    def __init__(self,
            dim,
            layer_name,
            irange = None,
            sparse_init = None,
            sparse_stdev = 1.,
            include_prob = 1.0,
            init_bias = 0.,
            W_lr_scale = None,
            b_lr_scale = None,
            max_col_norm = None):
        """

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX( np.zeros((self.dim,)) + init_bias, name = layer_name + '_b')

    def get_lr_scalers(self):

        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            W, = self.transformer.get_params()
            rval[W] = self.W_lr_scale

        if self.b_lr_scale is not None:
            rval[self.b] = self.b_lr_scale

        return rval

    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        self.output_space = VectorSpace(self.dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                                 self.irange,
                                 (self.input_dim, self.dim)) * \
                    (rng.uniform(0.,1., (self.input_dim, self.dim))
                     < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.dim))
            # Sparse init: same resampling loop as BinaryVectorMaxPool above
            for i in xrange(self.dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()
            W *= self.sparse_stdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def censor_updates(self, updates):

        if self.max_col_norm is not None:
            W, = self.transformer.get_params()
            if W in updates:
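                # Project each column of the updated W back onto the ball of
                # radius max_col_norm; 1e-7 guards against division by zero.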
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

    def get_total_state_space(self):
        return VectorSpace(self.dim)

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        rval = self.transformer.get_params()
        assert not isinstance(rval, set)
        rval = list(rval)
        assert self.b not in rval
        rval.append(self.b)
        return rval

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights
            # in design space. We got the data in topo space
            # and we don't have access to the dataset
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases, recenter = False):
        self.b.set_value(biases)
        if recenter:
            assert self.center
            if self.pool_size != 1:
                raise NotImplementedError()
            self.offset.set_value(sigmoid_numpy(self.b.get_value()))

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_topo(self):

        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W, = self.transformer.get_params()

        W = W.T

        W = W.reshape((self.dim, self.input_space.shape[0],
            self.input_space.shape[1], self.input_space.nchannels))

        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        return total_state

    def downward_state(self, total_state):
        return total_state

    def get_monitoring_channels(self):

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
              ('row_norms_min'  , row_norms.min()),
              ('row_norms_mean' , row_norms.mean()),
              ('row_norms_max'  , row_norms.max()),
              ('col_norms_min'  , col_norms.min()),
              ('col_norms_mean' , col_norms.mean()),
              ('col_norms_max'  , col_norms.max()),
            ])

    def get_monitoring_channels_from_state(self, state):

        P = state

        rval = OrderedDict()

        vars_and_prefixes = [ (P,'') ]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            # max_x.mean_u is "the mean over *u*nits of the max over e*x*amples"
            # The x and u are included in the name because otherwise its hard
            # to remember which axis is which when reading the monitor
            # I use inner.outer rather than outer_of_inner or something like that
            # because I want mean_x.* to appear next to each other in the alphabetical
            # list, as these are commonly plotted together
            for key, val in [
                    ('max_x.max_u', v_max.max()),
                    ('max_x.mean_u', v_max.mean()),
                    ('max_x.min_u', v_max.min()),
                    ('min_x.max_u', v_min.max()),
                    ('min_x.mean_u', v_min.mean()),
                    ('min_x.min_u', v_min.min()),
                    ('range_x.max_u', v_range.max()),
                    ('range_x.mean_u', v_range.mean()),
                    ('range_x.min_u', v_range.min()),
                    ('mean_x.max_u', v_mean.max()),
                    ('mean_x.mean_u', v_mean.mean()),
                    ('mean_x.min_u', v_mean.min())
                    ]:
                rval[prefix+key] = val

        return rval

    def sample(self, state_below = None, state_above = None,
            layer_above = None,
            theano_rng = None):

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        z = self.transformer.lmul(state_below) + self.b

        if msg is not None:
            z = z + msg

        on_prob = T.nnet.sigmoid(2. * z)

        samples = theano_rng.binomial(p = on_prob, n=1, size=on_prob.shape, dtype=on_prob.dtype) * 2. - 1.

        return samples

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def init_mf_state(self):
        raise NotImplementedError("This is just a copy-paste of BVMP")
        # work around theano bug with broadcasted vectors
        z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \
                self.b.dimshuffle('x', 0)
        rval = max_pool_channels(z = z,
                pool_size = self.pool_size)
        return rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """
        driver = numpy_rng.uniform(0.,1., (num_examples, self.dim))
        on_prob = sigmoid_numpy(2. * self.b.get_value())
        sample = 2. * (driver < on_prob) - 1.

        rval = sharedX(sample, name = 'v_sample_shared')

        return rval

    def expected_energy_term(self, state, average, state_below, average_below):

        # state = Print('h_state', attrs=['min', 'max'])(state)

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(state, self.b)
        weights_term = (self.transformer.lmul(state_below) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def linear_feed_forward_approximation(self, state_below):
        """
        Used to implement TorontoSparsity. Unclear exactly what properties of it are
        important or how to implement it for other layers.

        Properties it must have:
            output is same kind of data structure (ie, tuple of theano 2-tensors)
            as mf_update

        Properties it probably should have for other layer types:
            An infinitesimal change in state_below or the parameters should cause the same sign of change
            in the output of linear_feed_forward_approximation and in mf_update

            Should not have any non-linearities that cause the gradient to shrink

            Should disregard top-down feedback
        """

        z = self.transformer.lmul(state_below) + self.b

        if self.pool_size != 1:
            # Should probably implement sum pooling for the non-pooled version,
            # but in reality it's not totally clear what the right answer is
            raise NotImplementedError()

        return z, z

    def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below, self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']'
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_'+iter_name + '_2state'
        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'
        if msg is not None:
            z = z + msg
        h = T.tanh(z)

        return h
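
As a sanity check on the {-1, +1} parametrization above: with energy term -z*h for h in {-1, +1}, P(h = 1) = sigmoid(2z), which is what sample() and make_state() draw from, while mf_update returns the mean E[h] = tanh(z). A hedged numpy sketch (function name invented):

import numpy as np

def ising_sample_numpy(z, rng):
    # P(h = 1) = exp(z) / (exp(z) + exp(-z)) = sigmoid(2z)
    on_prob = 1. / (1. + np.exp(-2. * z))
    return 2. * (rng.uniform(size=z.shape) < on_prob) - 1.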
Example #16
class ToyRNNPhone(Model):
    """
    WRITEME
    """
    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
                 non_linearity='sigmoid', use_ground_truth=True):
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                   'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')

    def fprop_step(self, features, h_tm1, out):
        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
                                            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        features = T.set_subtensor(features[-1], (1 - self.alpha) *
                                   features[-1] + self.alpha * truth[-1])
        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
                                            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(fn=fn,
                                              sequences=[features],
                                              outputs_info=[dict(initial=init_h,
                                                                 taps=[-1]),
                                                            init_out])
            return out
        else:
            self.input_space.validate(data)
            features = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features],
                                                 outputs_info=[init_in,
                                                               dict(initial=init_h,
                                                                    taps=[-1]),
                                                               init_out])
            return out

    def predict_next(self, features, h_tm1):
        h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0),
                                            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(T.dot(features, self.W) +
                           self.hidden_transition_model.fprop(h_tm1).flatten() +
                           self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        return [self.W, self.b, self.U, self.c] + \
               self.hidden_transition_model.get_params()

    def get_input_source(self):
        return self.input_source

    def get_target_source(self):
        return self.target_source

    def censor_updates(self, updates):
        updates[self.alpha] = self.alpha_decrease_rate * self.alpha

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['alpha'] = self.alpha
        return rval
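
A hedged numpy sketch (function name invented) of the blend that fprop_step_prime applies to its last input feature: with alpha = 1 the model is fed pure ground truth, and since censor_updates() decays alpha by a factor of 0.999 per update, it is gradually weaned onto its own previous prediction.

import numpy as np

def blend_last_feature(features, truth, alpha):
    # Convex combination of the model's previous output and the ground truth.
    out = features.copy()
    out[-1] = (1. - alpha) * out[-1] + alpha * truth[-1]
    return out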
Example #17
class ToyRNNPhone(Model):
    """
    WRITEME
    """
    def __init__(self,
                 nvis,
                 nhid,
                 hidden_transition_model,
                 irange=0.05,
                 non_linearity='sigmoid',
                 use_ground_truth=True):
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange,
                                       high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange,
                                       high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')

    def fprop_step(self, features, h_tm1, out):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        features = T.set_subtensor(features[-1],
                                   (1 - self.alpha) * features[-1] +
                                   self.alpha * truth[-1])
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[dict(initial=init_h, taps=[-1]), init_out])
            return out
        else:
            self.input_space.validate(data)
            features = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features],
                                                 outputs_info=[
                                                     init_in,
                                                     dict(initial=init_h,
                                                          taps=[-1]), init_out
                                                 ])
            return out

    def predict_next(self, features, h_tm1):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() + self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        return [self.W, self.b, self.U, self.c] + \
               self.hidden_transition_model.get_params()

    def get_input_source(self):
        return self.input_source

    def get_target_source(self):
        return self.target_source

    def censor_updates(self, updates):
        updates[self.alpha] = self.alpha_decrease_rate * self.alpha

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['alpha'] = self.alpha
        return rval
Example #18
def simulate(inputs, model):
    space = VectorSpace(inputs.shape[1])
    X = space.make_theano_batch()
    Y = model.fprop(space.format_as(X, model.get_input_space()))
    f = theano.function([X], Y)
    return f(inputs)
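
Hypothetical usage of the simulate helper above, assuming `model` is a trained pylearn2 model whose input space has inputs.shape[1] dimensions; the whole batch is evaluated in a single compiled call:

import numpy as np
import theano

inputs = np.random.randn(16, 100).astype(theano.config.floatX)
outputs = simulate(inputs, model)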
Example #19
class MultiSoftmax(Layer):
    def __init__(self,
                 n_groups,
                 n_classes,
                 layer_name,
                 irange=None,
                 istdev=None,
                 sparse_init=None,
                 W_lr_scale=None,
                 b_lr_scale=None,
                 max_row_norm=None,
                 no_affine=False,
                 max_col_norm=None):
        """
        """

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, py_integer_types)

        self.output_space = MatrixSpace(n_groups, n_classes)
        self.b = sharedX(np.zeros((
            n_groups,
            n_classes,
        )), name='softmax_b')

    def get_lr_scalers(self):

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale

        return rval

    def get_monitoring_channels(self):
        return OrderedDict()

    def get_monitoring_channels_from_state(self, state, target=None):
        return OrderedDict()

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) + " of type " +
                            str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_groups, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_groups,
                          self.n_classes) * self.istdev
        else:
            raise NotImplementedError()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):

        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and \
                    value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        assert self.W.ndim == 3

        Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b

        rval = batched_softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        return self.cost_from_cost_matrix(self.cost_matrix(Y, Y_hat))

    def cost_from_cost_matrix(self, cost_matrix):
        return cost_matrix.sum(axis=2).mean()

    def cost_matrix(self, Y, Y_hat):
        return -Y * T.log(Y_hat + 0.000001)

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()

    def censor_updates(self, updates):
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                updates[W] = updated_W * (desired_norms /
                                          (1e-7 + row_norms)).dimshuffle(
                                              0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
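
fprop above produces a (batch, n_groups, n_classes) tensor and cost_from_cost_matrix sums over axis 2, which implies that batched_softmax normalizes each group independently over the class axis. A minimal numpy sketch under that assumption (function name invented):

import numpy as np

def batched_softmax_numpy(Z):
    # Z: (batch, n_groups, n_classes); independent softmax per group.
    Z = Z - Z.max(axis=-1, keepdims=True)  # numerical stability
    e = np.exp(Z)
    return e / e.sum(axis=-1, keepdims=True)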
Example #20
def fprop(self, state_below):
    vector_space = VectorSpace(self.output_space.get_total_dimension())
    X = self.output_space.format_as(state_below, vector_space)
    rval = T.dot(X - self.mean, self.P)
    rval = vector_space.format_as(rval, self.output_space)
    return rval
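
A hedged numpy reading of the fprop above, assuming self.mean is a per-dimension centering vector and self.P a (dim, n_components) projection matrix (e.g. PCA loadings):

import numpy as np

def center_and_project(X, mean, P):
    # Center the design matrix, then project onto the stored basis.
    return np.dot(X - mean, P)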
Example #21
class BinaryVectorMaxPool(HiddenLayer):
    """
        A hidden layer that does max-pooling on binary vectors.
        It has two sublayers, the detector layer and the pooling
        layer. The detector layer is its downward state and the pooling
        layer is its upward state.

        TODO: this layer uses (pooled, detector) as its total state,
              which can be confusing when listing all the states in
              the network left to right. Change this and
              pylearn2.expr.probabilistic_max_pooling to use
              (detector, pooled)
    """
    def __init__(self,
                 detector_layer_dim,
                 pool_size,
                 layer_name,
                 irange=None,
                 sparse_init=None,
                 include_prob=1.0,
                 init_bias=0.):
        """

            include_prob: probability of including a weight element in the set
                    of weights initialized to U(-irange, irange). If not included
                    it is initialized to 0.

        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX(np.zeros((self.detector_layer_dim, )) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError(
                "detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d"
                % (self.detector_layer_dim, self.pool_size,
                   self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                                 self.irange,
                                 (self.input_dim, self.detector_layer_dim)) * \
                    (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim))
                     < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights
            # in design space. We got the data in topo space
            # and we don't have access to the dataset
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row
        rows = total / cols
        return rows, cols

    def get_weights_topo(self):

        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W, = self.transformer.get_params()

        W = W.T

        W = W.reshape((self.detector_layer_dim, self.input_space.shape[0],
                       self.input_space.shape[1], self.input_space.nchannels))

        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        p, h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p, h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):

        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        rval = 0.

        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn(
                    "Do you really want to regularize the detector units to be sparser than the pooling units?"
                )

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self,
               state_below=None,
               state_above=None,
               layer_above=None,
               theano_rng=None):

        if theano_rng is None:
            raise ValueError(
                "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list."
            )

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size, msg,
                                                     theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z, pool_size=self.pool_size, theano_rng=theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={p_state: p_sample, h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below, average_below):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # Energy function is linear so it doesn't matter if we're averaging or not
        # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below
        # and d is the downward state of this layer

        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self,
                  state_below,
                  state_above,
                  layer_above=None,
                  double_weights=False,
                  iter_name=None):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + self.layer_name + '[' + iter_name + ']'
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_' + iter_name + '_2state'
        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'
        p, h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
Example #22
def fprop(self, state_below):
    vector_space = VectorSpace(self.output_space.get_total_dimension())
    X = self.output_space.format_as(state_below, vector_space)
    rval = T.dot(X - self.mean, self.P)
    rval = vector_space.format_as(rval, self.output_space)
    return rval
Example #23
class HingeLoss(Layer):

    def __init__(self, n_classes, layer_name, irange = None,
                 istdev = None,
                 sparse_init = None):

        self.__dict__.update(locals())
        del self.self

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name = 'hingeloss_b')

    def get_monitoring_channels(self):

        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ])

    def get_monitoring_channels_from_state(self, state, target=None):

        mx = state.max(axis=1)

        rval = OrderedDict([
                ('mean_max_class' , mx.mean()),
                ('max_max_class' , mx.max()),
                ('min_max_class' , mx.min())
        ])

        if target is not None:
            y_hat = self.target_convert(T.argmax(state, axis=1))
            # Assume target is in {0, 1} as binary one-hot
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'hingeloss_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0]))

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        assert self.W.ndim == 2
        b = self.b
        W = self.W

        rval = T.dot(state_below, W) + b

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def target_convert(self, Y):
        '''
        Converts binary targets in {0, 1} to {-1, +1}.
        '''
        Y_t = 2. * Y - 1.
        return Y_t

    def hinge_cost(self, W, Y, Y_hat, C=1.):
        #prob = .5 * T.dot(self.W.T, self.W) + C * (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1)
        prob = (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1)
        return prob

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate
        of Y.
        """

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        assert prob.ndim == 1
        rval = prob.mean()

        return rval


    def cost_matrix(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate.
        of Y.
        """

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op

        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        return prob


    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()
Example #24
0
class Softmax(HiddenLayer):

    def __init__(self, n_classes, layer_name, irange=None,
                 sparse_init=None, W_lr_scale=None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):

        rval = {}

        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        return rval

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):

        mx = state.max(axis=1)

        return {
            'mean_max_class': mx.mean(),
            'max_max_class': mx.max(),
            'min_max_class': mx.min()
        }

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()

        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):

        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.")

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        self.desired_space.validate(state_below)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        return h_sample

    def mf_update(self, state_below, state_above=None, layer_above=None,
                  double_weights=False, iter_name=None):
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        self.desired_space.validate(state_below)

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):

        rval = T.dot(downward_state, self.W.T)

        rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
            scale is because the visible layer also goes into the
            cost. it uses the mean over units and examples, so that
            the scale of the cost doesn't change too much with batch
            size or example size.
            we need to multiply this cost by scale to make sure that
            it is put on the same scale as the reconstruction cost
            for the visible units. ie, scale should be 1/nvis
        """


        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        # Numerically stable log-softmax: log_prob = z - log(sum(exp(z)))
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return -rval

    def make_state(self, num_examples, numpy_rng):
        """ Returns a shared variable containing an actual state
           (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        h_state.name = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        return coeff * T.sqr(self.W).sum()

    def expected_energy_term(self, state, average, state_below, average_below):

        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)
        self.desired_space.validate(state_below)

        # The energy function is linear, so it doesn't matter whether we
        # are averaging or not. Specifically, our terms are
        # -u^T W d - b^T d, where u is the upward state of the layer below
        # and d is the downward state of this layer.

        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval