Example #1
class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous space,
    as done in e.g. language models. It takes labels as input (an IndexSpace)
    and maps them to their continuous embeddings, concatenating the results.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or"
                             "istdev in order to intitalize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_monitoring_channels)
    def get_monitoring_channels(self):

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([('row_norms_min',  row_norms.min()),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max',  row_norms.max()),
                            ('col_norms_min',  col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max',  col_norms.max()), ])

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if isinstance(space, IndexSpace):
            self.input_dim = space.dim
            self.input_space = space
        else:
            raise ValueError("ProjectionLayer needs an IndexSpace as input")
        self.output_space = VectorSpace(self.dim * self.input_dim)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange,
                            self._irange,
                            (space.max_labels, self.dim))
        else:
            W = rng.randn(space.max_labels, self.dim) * self._istdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        z = self.transformer.project(state_below)
        return z

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params
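
For context, a minimal usage sketch: the layer is designed to sit at the bottom
of a pylearn2 MLP whose input space is an IndexSpace. The configuration below
(layer names, sizes, and the Softmax output layer) is illustrative, not part of
the original example.

# A hedged sketch, assuming standard pylearn2 imports; all names and sizes
# here are illustrative.
from pylearn2.models.mlp import MLP, Softmax
from pylearn2.space import IndexSpace

mlp = MLP(
    input_space=IndexSpace(max_labels=10000, dim=5),  # 5 context labels
    layers=[
        ProjectionLayer(dim=128, layer_name='proj', irange=0.05),
        Softmax(n_classes=10000, layer_name='out', irange=0.05),
    ],
)
# The output space of the projection is VectorSpace(128 * 5): each label is
# replaced by its 128-dimensional embedding and the results are concatenated.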
Example #2
class vLBLSoft(Model):
    def __init__(self, dict_size, dim, context_length, k, irange=0.1, seed=22):
        super(vLBLSoft, self).__init__()
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size
        C_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                         size=(dim,context_length)),
                              dtype=theano.config.floatX)
        self.C = theano.shared(value=C_values, name='C', borrow=True)
        W_context = rng.uniform(-irange, irange, (dict_size, dim))
        W_context = sharedX(W_context, name='W_context')
        W_target = rng.uniform(-irange, irange, (dict_size, dim))
        W_target = sharedX(W_target, name='W_target')
        self.projector_context = MatrixMul(W_context)
        self.projector_target = MatrixMul(W_target)
        self.W_context = W_context
        self.W_target = W_target
        b_values = np.asarray(rng.normal(0, math.sqrt(irange), size=(dict_size,)),
                              dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='b', borrow=True)
        self.input_space = IndexSpace(dim=context_length, max_labels=dict_size)
        self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.allY = T.as_tensor_variable(
            np.arange(dict_size, dtype=np.int64).reshape(dict_size, 1))
    
    def get_params(self):
        # get W from the projectors
        rval1 = self.projector_context.get_params()
        rval2 = self.projector_target.get_params()
        # add C and b
        rval1.extend([self.C, self.b])
        rval1.extend(rval2)
        return rval1
    
    def fprop(self, state_below):
        """
        state_below is r_w?
        """
        state_below = state_below.reshape((state_below.shape[0], self.dim, self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval
    
    def get_default_cost(self):
        return Default()
    
    def get_monitoring_data_specs(self):
        """
        Returns data specs requiring both inputs and targets."""
        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)
    
    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['nll'] = self.cost_from_X(data)
        rval['perplexity'] = 10 ** (rval['nll']/np.log(10).astype('float32'))
        return rval
    
    def score(self, all_q_w, q_h):
        sallwh = T.dot(q_h, all_q_w.T) + self.b.dimshuffle('x', 0)
        return sallwh

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector_context.project(X)
        q_h = self.fprop(X)
        rval = self.cost(Y,q_h)
        return rval
    
    def cost(self, Y, q_h):
        all_q_w = self.W_target
        s = self.score(all_q_w,q_h)
        p_w_given_h = T.nnet.softmax(s)
        # Pick out the log-probability of each example's target label.
        log_p = T.log(p_w_given_h)[T.arange(Y.shape[0]), Y.flatten()]
        return T.cast(-T.mean(log_p), theano.config.floatX)

    def apply_dropout(self, state, include_prob, scale, theano_rng, input_space, mask_value=0, per_example=True):
        """
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(self.apply_dropout(substate, include_prob,
                                            scale, theano_rng, input_space,
                                            mask_value)
                         for substate in state)
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape,
                                       dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape,
                                       dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim),
                                             [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)
        return T.cast(rval, state.dtype)


    def dropout_fprop(self, state_below, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.,
                      input_scales=None, per_example=True):

        if input_include_probs is None:
            input_include_probs = {}

        if input_scales is None:
            input_scales = {}

        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))

        include_prob = default_input_include_prob
        scale = default_input_scale
        state_below = self.apply_dropout(
                state=state_below,
                include_prob=include_prob,
                theano_rng=theano_rng,
                scale=scale,
                mask_value=0,
                input_space=self.get_input_space(),
                per_example=per_example
            )
        state_below = self.fprop(state_below)

        return state_below
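
The context combination that fprop implements is q_h = sum_i C[:, i] * r_{w_i}:
an elementwise product of each context word's embedding with a per-position
weight vector, summed over positions. A plain-NumPy restatement, with
illustrative shapes:

import numpy as np

# NumPy sketch of vLBLSoft.fprop; batch, dim and context_length are
# illustrative values, not taken from the original example.
batch, dim, context_length = 4, 3, 2
C = np.random.randn(dim, context_length)
r = np.random.randn(batch, dim, context_length)   # projected context words

q_h = (C[np.newaxis, :, :] * r).sum(axis=2)       # shape (batch, dim)
assert q_h.shape == (batch, dim)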
Example #3
class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous space,
    as done in e.g. language models. It takes labels as input (an IndexSpace)
    and maps them to their continuous embeddings, concatenating the results.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or"
                             "istdev in order to intitalize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, *args, **kwargs):

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    def _check_input_space_and_get_max_labels(self, space):
        if isinstance(space, IndexSpace):
            return space.max_labels
        if isinstance(space, CompositeSpace):
            ml = []
            for c in space.components:
                ml.append(self._check_input_space_and_get_max_labels(c))
            # check that all of them are equal
            if len(set(ml)) != 1:
                raise ValueError("Composite space is empty or containing "
                                 "incompatible index spaces")
            return ml[0]
        raise ValueError("ProjectionLayer needs an IndexSpace or a "
                         "CompositeSpace of them as input")

    def _build_output_space(self, space):
        if isinstance(space, IndexSpace):
            return VectorSpace(self.dim * space.dim)
        if isinstance(space, CompositeSpace):
            return CompositeSpace(
                [self._build_output_space(c) for c in space.components])
        assert False

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        max_labels = self._check_input_space_and_get_max_labels(space)
        self.input_space = space
        self.output_space = self._build_output_space(space)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange, self._irange,
                            (max_labels, self.dim))
        else:
            W = rng.randn(max_labels, self.dim) * self._istdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def _fprop_recursive(self, state_below):
        if isinstance(state_below, tuple):
            return tuple(self._fprop_recursive(s) for s in state_below)
        return self.transformer.project(state_below)

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        return self._fprop_recursive(state_below)

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params

    @wraps(Layer.get_weight_decay)
    def get_weight_decay(self, coeff):

        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    @wraps(Layer.get_l1_weight_decay)
    def get_l1_weight_decay(self, coeff):

        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * abs(W).sum()
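
This variant generalizes the input handling: any nesting of CompositeSpace is
accepted as long as every leaf IndexSpace shares a single max_labels value,
since all components are projected through the same embedding matrix. A sketch
of an accepted and a rejected input space, assuming pylearn2's space classes:

from pylearn2.space import CompositeSpace, IndexSpace

# Accepted: all leaves share max_labels=5000, so one W of shape
# (5000, dim) can serve every component.
ok = CompositeSpace([IndexSpace(max_labels=5000, dim=3),
                     IndexSpace(max_labels=5000, dim=1)])

# Rejected: mixed vocabularies make set_input_space raise a ValueError
# via _check_input_space_and_get_max_labels.
bad = CompositeSpace([IndexSpace(max_labels=5000, dim=3),
                      IndexSpace(max_labels=9000, dim=1)])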
Example #4
class vLBL(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1, seed=22,
                 max_row_norm=None, max_col_norm=None):

        rng = np.random.RandomState(seed)
        self.rng = rng
        self.k = k
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size
        C = rng.randn(dim, context_length)
        self.C = sharedX(C)

        W = rng.uniform(-irange, irange, (dict_size, dim))
        W = sharedX(W)
        self.W = W
        # TODO: maybe have another projector for targets
        self.projector = MatrixMul(W)

        self.b = sharedX(np.zeros((dict_size,)), name='vLBL_b')

        self.input_space = IndexSpace(dim=context_length, max_labels=dict_size)
        # self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.output_space = VectorSpace(dim=dict_size)
        self.max_row_norm = max_row_norm
        self.max_col_norm = max_col_norm

    def get_params(self):

        rval = self.projector.get_params()
        rval.extend([self.C, self.b])
        return rval

    def get_default_cost(self):
        return Default()


    def fprop(self, state_below):
        "q^(h) from EQ. 2"

        state_below = state_below.reshape((state_below.shape[0], self.dim, self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def _modify_updates(self, updates):
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                scales = desired_norms / (1e-7 + row_norms)
                updates[W] = updated_W * scales.dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

    def score(self, q_h):
        q_w = self.projector._W
        rval = T.dot(q_h, q_w.T) + self.b.dimshuffle('x', 0)
        return rval

    def cost(self, Y, q_h):
        z = self.score(q_h)
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        rval = as_floatX(log_prob_of.mean())
        return - rval

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector.project(X)
        q_h = self.fprop(X)
        return self.cost(Y, q_h)


    def get_monitoring_data_specs(self):

        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        X, Y = data
        
        W_context = self.W
        W_target = self.W
        b = self.b
        C = self.C

        sq_W_context = T.sqr(W_context)
        # sq_W_target = T.sqr(W_target)
        sq_b = T.sqr(b)
        sq_c = T.sqr(C)

        row_norms_W_context = T.sqrt(sq_W_context.sum(axis=1))
        col_norms_W_context = T.sqrt(sq_W_context.sum(axis=0))

        # row_norms_W_target = T.sqrt(sq_W_target.sum(axis=1))
        # col_norms_W_target = T.sqrt(sq_W_target.sum(axis=0))
        
        col_norms_b = T.sqrt(sq_b.sum(axis=0))
        col_norms_c = T.sqrt(sq_c.sum(axis=0))

        rval = OrderedDict([
                            ('W_context_row_norms_min'  , row_norms_W_context.min()),
                            ('W_context_row_norms_mean' , row_norms_W_context.mean()),
                            ('W_context_row_norms_max'  , row_norms_W_context.max()),
                            ('W_context_col_norms_min'  , col_norms_W_context.min()),
                            ('W_context_col_norms_mean' , col_norms_W_context.mean()),
                            ('W_context_col_norms_max'  , col_norms_W_context.max()),

                            # ('W_target_row_norms_min'  , row_norms_W_target.min()),
                            # ('W_target_row_norms_mean' , row_norms_W_target.mean()),
                            # ('W_target_row_norms_max'  , row_norms_W_target.max()),
                            # ('W_target_col_norms_min'  , col_norms_W_target.min()),
                            # ('W_target_col_norms_mean' , col_norms_W_target.mean()),
                            # ('W_target_col_norms_max'  , col_norms_W_target.max()),
                            
                            ('b_col_norms_min'  , col_norms_b.min()),
                            ('b_col_norms_mean' , col_norms_b.mean()),
                            ('b_col_norms_max'  , col_norms_b.max()),

                            ('c_col_norms_min'  , col_norms_c.min()),
                            ('c_col_norms_mean' , col_norms_c.mean()),
                            ('c_col_norms_max'  , col_norms_c.max()),
                            ])
            
        nll = self.cost_from_X(data)
        
        rval['perplexity'] = as_floatX(10 ** (nll/np.log(10)))
        return rval


    def apply_dropout(self, state, include_prob, scale, theano_rng, input_space, mask_value=0, per_example=True):
        """
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(self.apply_dropout(substate, include_prob,
                                            scale, theano_rng, input_space,
                                            mask_value)
                         for substate in state)
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape,
                                       dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape,
                                       dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim),
                                             [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)
        return T.cast(rval, state.dtype)


    def dropout_fprop(self, state_below, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.,
                      input_scales=None, per_example=True):

        if input_include_probs is None:
            input_include_probs = {}

        if input_scales is None:
            input_scales = {}

        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))

        include_prob = default_input_include_prob
        scale = default_input_scale
        state_below = self.apply_dropout(
                state=state_below,
                include_prob=include_prob,
                theano_rng=theano_rng,
                scale=scale,
                mask_value=0,
                input_space=self.get_input_space(),
                per_example=per_example
            )
        state_below = self.fprop(state_below)

        return state_below
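
The max-norm constraint in _modify_updates rescales any row of W whose L2 norm
exceeds max_row_norm back onto the norm ball; the 1e-7 term guards against
division by zero for all-zero rows. A NumPy restatement with illustrative
values:

import numpy as np

# NumPy sketch of the max_row_norm clipping from _modify_updates.
max_row_norm = 1.0
W = np.random.randn(100, 16) * 3.0                 # illustrative weights
row_norms = np.sqrt(np.square(W).sum(axis=1))
desired = np.clip(row_norms, 0, max_row_norm)
W_clipped = W * (desired / (1e-7 + row_norms))[:, np.newaxis]
assert np.sqrt(np.square(W_clipped).sum(axis=1)).max() <= max_row_norm + 1e-6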
Example #5
class vLBL(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1, seed=22):

        rng = np.random.RandomState(seed)
        self.rng = rng
        self.k = k
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size
        C = rng.randn(dim, context_length)
        self.C = sharedX(C)

        W = rng.uniform(-irange, irange, (dict_size, dim))
        W = sharedX(W)

        # TODO maybe have another projector for tagets
        self.projector = MatrixMul(W)

        self.b = sharedX(np.zeros((dict_size,)), name='vLBL_b')

        self.set_spaces()

        # NOTE: overwrites the rng created from `seed` above with a fixed seed
        self.rng = np.random.RandomState(2014)


    def set_spaces(self):
        self.input_space = IndexSpace(dim=self.context_length, max_labels=self.dict_size)
        self.output_space = VectorSpace(dim=self.dict_size)

    def get_params(self):

        rval = self.projector.get_params()
        rval.extend([self.C, self.b])
        return rval


    def context(self, state_below):
        "q^(h) from EQ. 2"

        state_below = state_below.reshape((state_below.shape[0], self.dim, self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)

        return rval


    def score(self, X, Y=None):
        X = self.projector.project(X)
        q_h = self.context(X)
        # this is used during training
        if Y is not None:
            q_w = self.projector.project(Y).reshape((Y.shape[0], self.dim))
            rval = (q_w * q_h).sum(axis=1) + self.b[Y].flatten()
        # during nll
        else:
            q_w = self.projector._W
            rval = T.dot(q_h, q_w.T) + self.b.dimshuffle('x', 0)

        return rval


    def cost_from_X(self, data):
        X, Y = data
        z = self.score(X)
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        rval = as_floatX(log_prob_of.mean())
        return - rval


    def get_monitoring_data_specs(self):

        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        X, Y = data
        rval = OrderedDict()

        nll = self.cost_from_X(data)
        rval['perplexity'] = as_floatX(10 ** (nll/np.log(10)))
        return rval
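
cost_from_X stabilizes the softmax numerically by subtracting each row's
maximum score before exponentiating; the shift cancels in the log-probability
but keeps exp() from overflowing. A short NumPy check with illustrative scores:

import numpy as np

# Log-sum-exp stabilization as used in cost_from_X.
z = np.array([[1000.0, 1001.0, 999.0]])            # illustrative raw scores
z_shift = z - z.max(axis=1, keepdims=True)
log_prob = z_shift - np.log(np.exp(z_shift).sum(axis=1, keepdims=True))
assert np.isfinite(log_prob).all()                 # no overflow after shift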
Example #6
class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous space,
    as done in e.g. language models. It takes labels as input (an IndexSpace)
    and maps them to their continuous embeddings, concatenating the results.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or"
                             "istdev in order to intitalize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_monitoring_channels)
    def get_monitoring_channels(self):

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if isinstance(space, IndexSpace):
            self.input_dim = space.dim
            self.input_space = space
        else:
            raise ValueError("ProjectionLayer needs an IndexSpace as input")
        self.output_space = VectorSpace(self.dim * self.input_dim)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange, self._irange,
                            (space.max_labels, self.dim))
        else:
            W = rng.randn(space.max_labels, self.dim) * self._istdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        z = self.transformer.project(state_below)
        return z

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params
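
The projection in fprop amounts to an embedding lookup followed by
concatenation: MatrixMul.project picks one row of W per input label, and the
rows for each example are flattened together. A NumPy sketch of that behavior,
with illustrative shapes:

import numpy as np

# NumPy sketch of what the transformer does to an IndexSpace batch.
max_labels, dim = 1000, 16
W = np.random.randn(max_labels, dim)               # embedding matrix
labels = np.array([[3, 7, 42]])                    # one example, 3 labels
projected = W[labels].reshape(labels.shape[0], -1)
assert projected.shape == (1, 3 * dim)             # dim * number of labels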
Example #7
class vLBLSoft(Model):
    def __init__(self, dict_size, dim, context_length, k, irange=0.1, seed=22):
        super(vLBLSoft, self).__init__()
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size
        C_values = np.asarray(rng.normal(0, math.sqrt(irange), size=(dim, context_length)), dtype=theano.config.floatX)
        self.C = theano.shared(value=C_values, name="C", borrow=True)
        W_context = rng.uniform(-irange, irange, (dict_size, dim))
        W_context = sharedX(W_context, name="W_context")
        W_target = rng.uniform(-irange, irange, (dict_size, dim))
        W_target = sharedX(W_target, name="W_target")
        self.projector_context = MatrixMul(W_context)
        self.projector_target = MatrixMul(W_target)
        self.W_context = W_context
        self.W_target = W_target
        b_values = np.asarray(rng.normal(0, math.sqrt(irange), size=(dict_size,)), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name="b", borrow=True)
        self.input_space = IndexSpace(dim=context_length, max_labels=dict_size)
        self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.allY = T.as_tensor_variable(np.arange(dict_size, dtype=np.int64).reshape(dict_size, 1))

    def get_params(self):
        # get W from projector
        rval1 = self.projector_context.get_params()
        rval2 = self.projector_target.get_params()
        # add C, b
        rval1.extend([self.C, self.b])
        rval1.extend(rval2)
        return rval1

    def fprop(self, state_below):
        """
        state_below is r_w?
        """
        state_below = state_below.reshape((state_below.shape[0], self.dim, self.context_length))
        rval = self.C.dimshuffle("x", 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def get_default_cost(self):
        return Default()

    def get_monitoring_data_specs(self):
        """
        Returns data specs requiring both inputs and targets.

        Returns
        -------
        data_specs : (space, source) tuple
            A CompositeSpace of the input and output spaces paired with the
            corresponding source names.
        """
        space = CompositeSpace((self.get_input_space(), self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval["nll"] = self.cost_from_X(data)
        rval["perplexity"] = 10 ** (rval["nll"] / np.log(10).astype("float32"))
        return rval

    def score(self, all_q_w, q_h):
        sallwh = T.dot(q_h, all_q_w.T) + self.b.dimshuffle("x", 0)
        return sallwh

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector_context.project(X)
        q_h = self.fprop(X)

        rval = self.cost(Y, q_h)
        return rval

    def cost(self, Y, q_h):
        all_q_w = self.projector_target.project(self.allY)
        s = self.score(all_q_w, q_h)
        p_w_given_h = T.nnet.softmax(s)
        # Pick out the log-probability of each example's target label.
        log_p = T.log(p_w_given_h)[T.arange(Y.shape[0]), Y.flatten()]
        return T.cast(-T.mean(log_p), theano.config.floatX)

    def apply_dropout(self, state, include_prob, scale, theano_rng, input_space, mask_value=0, per_example=True):
        """
        Parameters
        ----------
        state: WRITEME
        include_prob : WRITEME
        scale : WRITEME
        theano_rng : WRITEME
        input_space : WRITEME
        mask_value : WRITEME
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(
                self.apply_dropout(substate, include_prob, scale, theano_rng,
                                   input_space, mask_value)
                for substate in state
            )
        # TODO: all of this assumes that if it's not a tuple, it's
        # a dense tensor. It hasn't been tested with sparse types.
        # A method to format the mask (or any other values) as
        # the given symbolic type should be added to the Spaces
        # interface.
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape, dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape, dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim), [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)

        return T.cast(rval, state.dtype)

    def dropout_fprop(
        self,
        state_below,
        default_input_include_prob=0.5,
        input_include_probs=None,
        default_input_scale=2.0,
        input_scales=None,
        per_example=True,
    ):
        """
        Returns the output of the MLP, when applying dropout to the input and
        intermediate layers.


        Parameters
        ----------
        state_below : WRITEME
            The input to the MLP
        default_input_include_prob : WRITEME
        input_include_probs : WRITEME
        default_input_scale : WRITEME
        input_scales : WRITEME
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.


        Notes
        -----
        Each input to each layer is randomly included or
        excluded for each example. The probability of inclusion is independent
        for each input and each example. Each layer uses
        `default_input_include_prob` unless that layer's name appears as a key
        in input_include_probs, in which case the input inclusion probability
        is given by the corresponding value.

        Each feature is also multiplied by a scale factor. The scale factor for
        each layer's input scale is determined by the same scheme as the input
        probabilities.
        """

        warnings.warn(
            "dropout doesn't use fixed_var_descr so it won't work "
            "with algorithms that make more than one theano "
            "function call per batch, such as BGD. Implementing "
            "fixed_var descr could increase the memory usage "
            "though."
        )

        if input_include_probs is None:
            input_include_probs = {}

        if input_scales is None:
            input_scales = {}

        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))

        # This model has a single input, so the per-layer lookups done by
        # MLP.dropout_fprop reduce to the default probability and scale.
        include_prob = default_input_include_prob
        scale = default_input_scale
        state_below = self.apply_dropout(
            state=state_below,
            include_prob=include_prob,
            theano_rng=theano_rng,
            scale=scale,
            mask_value=0,
            input_space=self.get_input_space(),
            per_example=per_example,
        )
        state_below = self.fprop(state_below)

        return state_below
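
apply_dropout implements inverted dropout: units survive with probability
include_prob and the survivors are multiplied by scale (conventionally
1 / include_prob) so that the expected activation is unchanged. A NumPy sketch
with illustrative values:

import numpy as np

# Inverted dropout as in apply_dropout (the mask_value=0 branch).
rng = np.random.RandomState(0)
include_prob, scale = 0.5, 2.0                     # scale == 1 / include_prob
state = rng.randn(1000, 8)
mask = rng.binomial(1, include_prob, size=state.shape)
dropped = state * mask * scale
# The mean activation is preserved in expectation.
assert abs(dropped.mean() - state.mean()) < 0.1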
Example #8
class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous space,
    as done in e.g. language models. It takes labels as input (an IndexSpace)
    and maps them to their continuous embeddings, concatenating the results.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or"
                             "istdev in order to intitalize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, *args, **kwargs):

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([('row_norms_min',  row_norms.min()),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max',  row_norms.max()),
                            ('col_norms_min',  col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max',  col_norms.max()), ])

    def _check_input_space_and_get_max_labels(self, space):
        if isinstance(space, IndexSpace):
            return space.max_labels
        if isinstance(space, CompositeSpace):
            ml = []
            for c in space.components:
                ml.append(self._check_input_space_and_get_max_labels(c))
            # check that all of them are equal
            if len(set(ml)) != 1:
                raise ValueError("Composite space is empty or containing "
                                 "incompatible index spaces")
            return ml[0]
        raise ValueError("ProjectionLayer needs an IndexSpace or a "
                         "CompositeSpace of them as input")

    def _build_output_space(self, space):
        if isinstance(space, IndexSpace):
            return VectorSpace(self.dim * space.dim)
        if isinstance(space, CompositeSpace):
            return CompositeSpace([self._build_output_space(c)
                                   for c in space.components])
        assert False

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        max_labels = self._check_input_space_and_get_max_labels(space)
        self.input_space = space
        self.output_space = self._build_output_space(space)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange,
                            self._irange,
                            (max_labels, self.dim))
        else:
            W = rng.randn(max_labels, self.dim) * self._istdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def _fprop_recursive(self, state_below):
        if isinstance(state_below, tuple):
            return tuple(self._fprop_recursive(s) for s in state_below)
        return self.transformer.project(state_below)

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        return self._fprop_recursive(state_below)

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params

    @wraps(Layer.get_weight_decay)
    def get_weight_decay(self, coeff):

        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    @wraps(Layer.get_l1_weight_decay)
    def get_l1_weight_decay(self, coeff):

        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * abs(W).sum()
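
The two decay methods return scalar penalties that a cost such as pylearn2's
WeightDecay adds to the training objective: an L2 term coeff * sum(W ** 2) and
an L1 term coeff * sum(|W|). Restated in NumPy with illustrative coefficients:

import numpy as np

# NumPy restatement of get_weight_decay (L2) and get_l1_weight_decay (L1);
# the coefficient values are illustrative.
W = np.random.randn(10000, 128)
l2_penalty = 1e-4 * np.square(W).sum()
l1_penalty = 1e-5 * np.abs(W).sum()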