Example No. 1
    def __init__(self, emb_dim, dim, num_input_words, 
                 num_output_words, vocab, 
                 **kwargs):
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab

        self._word_to_id = WordToIdOp(self._vocab)

        children = []

        self._main_lookup = LookupTable(self._num_input_words, emb_dim, name='main_lookup')
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._decoder_fork = Linear(emb_dim, 4 * dim, name='decoder_fork')
        self._decoder_rnn = LSTM(dim, name='decoder_rnn')
        children.extend([self._main_lookup,
                         self._encoder_fork, self._encoder_rnn,
                         self._decoder_fork, self._decoder_rnn])
        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)
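A recurring pattern above is the Linear(emb_dim, 4 * dim) "fork" feeding an LSTM(dim): Blocks' LSTM expects its input to already contain the four gate pre-activations, so a plain Linear maps the embeddings up front. A minimal sketch of that pairing, assuming a standard Blocks/Theano install (the names and sizes below are illustrative only):

import numpy
import theano
from theano import tensor
from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

emb_dim, dim = 6, 8
fork = Linear(emb_dim, 4 * dim, name='encoder_fork',
              weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
rnn = LSTM(dim, name='encoder_rnn',
           weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
fork.initialize()
rnn.initialize()

x = tensor.tensor3('x')                    # (time, batch, emb_dim)
states, cells = rnn.apply(fork.apply(x))   # states: (time, batch, dim)
f = theano.function([x], states)
print(f(numpy.zeros((3, 2, emb_dim), dtype=theano.config.floatX)).shape)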
Example No. 2
def softmax_layer(h, y, x_mask, y_mask, lens, vocab_size, hidden_size,
                  boosting):
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=hidden_size,
                              output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()

    #y_hat = softmax.apply(linear_output, extra_ndim=1)
    #y_hat.name = 'y_hat'
    cost_a = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1)
    #produces correct average
    cost_a = cost_a * y_mask

    if boosting:
        #boosting step, must divide by length here
        lensMat = T.tile(lens, (y.shape[0], 1))
        cost_a = cost_a / lensMat

    #only count cost of correctly masked entries
    cost = cost_a.sum() / y_mask.sum()

    cost.name = 'cost'

    return (linear_output, cost)
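The cost above is a masked average: per-token cross-entropies are zeroed wherever y_mask is 0 and then divided by the number of real tokens. A tiny NumPy sketch of that reduction (values are made up):

import numpy

cost_a = numpy.array([[2.0, 1.0],
                      [0.5, 3.0]])   # (time, batch) per-token costs
y_mask = numpy.array([[1.0, 1.0],
                      [1.0, 0.0]])   # the second sequence ends in padding
cost = (cost_a * y_mask).sum() / y_mask.sum()
print(cost)                          # 3.5 / 3 ~= 1.17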
Example No. 3
class ShallowFusionReadout(Readout):
    def __init__(self, lm_costs_name, lm_weight,
                 normalize_am_weights=False,
                 normalize_lm_weights=False,
                 normalize_tot_weights=True,
                 am_beta=1.0,
                 **kwargs):
        super(ShallowFusionReadout, self).__init__(**kwargs)
        self.lm_costs_name = lm_costs_name
        self.lm_weight = lm_weight
        self.normalize_am_weights = normalize_am_weights
        self.normalize_lm_weights = normalize_lm_weights
        self.normalize_tot_weights = normalize_tot_weights
        self.am_beta = am_beta
        self.softmax = NDimensionalSoftmax()
        self.children += [self.softmax]

    @application
    def readout(self, **kwargs):
        lm_costs = -kwargs.pop(self.lm_costs_name)
        if self.normalize_lm_weights:
            lm_costs = self.softmax.log_probabilities(
                lm_costs, extra_ndim=lm_costs.ndim - 2)
        am_pre_softmax = self.am_beta * super(ShallowFusionReadout, self).readout(**kwargs)
        if self.normalize_am_weights:
            am_pre_softmax = self.softmax.log_probabilities(
                am_pre_softmax, extra_ndim=am_pre_softmax.ndim - 2)
        x = am_pre_softmax + self.lm_weight * lm_costs
        if self.normalize_tot_weights:
            x = self.softmax.log_probabilities(x, extra_ndim=x.ndim - 2)
        return x
class ShallowFusionReadout(Readout):
    def __init__(self,
                 lm_costs_name,
                 lm_weight,
                 normalize_am_weights=False,
                 normalize_lm_weights=False,
                 normalize_tot_weights=True,
                 am_beta=1.0,
                 **kwargs):
        super(ShallowFusionReadout, self).__init__(**kwargs)
        self.lm_costs_name = lm_costs_name
        self.lm_weight = lm_weight
        self.normalize_am_weights = normalize_am_weights
        self.normalize_lm_weights = normalize_lm_weights
        self.normalize_tot_weights = normalize_tot_weights
        self.am_beta = am_beta
        self.softmax = NDimensionalSoftmax()
        self.children += [self.softmax]

    @application
    def readout(self, **kwargs):
        lm_costs = -kwargs.pop(self.lm_costs_name)
        if self.normalize_lm_weights:
            lm_costs = self.softmax.log_probabilities(
                lm_costs, extra_ndim=lm_costs.ndim - 2)
        am_pre_softmax = self.am_beta * super(ShallowFusionReadout,
                                              self).readout(**kwargs)
        if self.normalize_am_weights:
            am_pre_softmax = self.softmax.log_probabilities(
                am_pre_softmax, extra_ndim=am_pre_softmax.ndim - 2)
        x = am_pre_softmax + self.lm_weight * lm_costs
        if self.normalize_tot_weights:
            x = self.softmax.log_probabilities(x, extra_ndim=x.ndim - 2)
        return x
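In readout above, the language-model costs are negated back into (unnormalised) log-probabilities and added to the scaled acoustic scores; the optional log_probabilities calls renormalise in log space. A small NumPy sketch of that shallow-fusion combination, with made-up scores:

import numpy

def log_softmax(x):
    x = x - x.max(axis=-1, keepdims=True)
    return x - numpy.log(numpy.exp(x).sum(axis=-1, keepdims=True))

am_scores = numpy.array([[1.0, 0.2, -0.5]])   # acoustic readout, pre-softmax
lm_costs = numpy.array([[0.7, 1.9, 2.3]])     # LM costs (negative log-probs)
am_beta, lm_weight = 1.0, 0.5
combined = am_beta * am_scores + lm_weight * (-lm_costs)
print(log_softmax(combined))                  # the normalize_tot_weights=True case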
Example No. 5
def create_rnn(hidden_dim, vocab_dim,mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # 
    W = LookupTable(
        name = "W1",
        #dim = hidden_dim*4,
        dim = hidden_dim,
        length = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim, 
            name = 'H',
            weights_init = initialization.IsotropicGaussian(0.01),
            biases_init = initialization.Constant(0.0)
        )
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name = "H",
            dim = hidden_dim,
            activation = Tanh(),
            weights_init = initialization.IsotropicGaussian(0.01)
        )
    # 
    S = Linear(
        name = "W2",
        input_dim = hidden_dim,
        output_dim = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )

    A = NDimensionalSoftmax(
        name = "softmax"
    )

    initLayers([W,H,S])
    activations = W.apply(x)
    hiddens = H.apply(activations)#[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return  cg, layers, y_hat, cost
Example No. 6
def softmax_layer(h, y, frame_length, hidden_size):
    hidden_to_output = Linear(name="hidden_to_output", input_dim=hidden_size, output_dim=frame_length)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = "linear_output"
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = "y_hat"
    cost = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1).mean()
    cost.name = "cost"
    return y_hat, cost
class NewSoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.
    Interprets readout elements as energies corresponding to their indices.
    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.
    """
    def __init__(self, initial_output=0, **kwargs):
        super(NewSoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
        self.name = 'newbidirectional'

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emitProbs(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        # note: unlike `emit`, no sample is drawn here; only the flattened
        # probabilities are returned
        return self.pvals_flat

    @application
    def emit(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        winning_index = generated.reshape(probs.shape).argmax(axis=-1)
        return winning_index, self.pvals_flat[0][winning_index]

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size, ), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        return super(NewSoftmaxEmitter, self).get_dim(name)
Example No. 8
def softmax_layer(h, y, vocab_size, hidden_size):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
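What NDimensionalSoftmax adds over the plain Softmax brick is the extra_ndim argument: the leading axes (here, time) are flattened away, the 2-D softmax or cross-entropy is applied, and the result is reshaped back. A standalone sketch, assuming a standard Blocks/Theano install:

import numpy
import theano
from theano import tensor
from blocks.bricks import NDimensionalSoftmax

logits = tensor.tensor3('logits')      # (time, batch, vocab)
targets = tensor.lmatrix('targets')    # (time, batch) integer word ids
softmax = NDimensionalSoftmax()
probs = softmax.apply(logits, extra_ndim=1)
xent = softmax.categorical_cross_entropy(targets, logits, extra_ndim=1)

f = theano.function([logits, targets], [probs, xent])
p, c = f(numpy.random.randn(5, 2, 7).astype(theano.config.floatX),
         numpy.random.randint(0, 7, size=(5, 2)).astype('int64'))
print(p.shape, c.shape)                # (5, 2, 7) (5, 2)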
Example No. 9
def softmax_layer(h, y, vocab_size, hidden_size):
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=hidden_size,
                              output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = softmax.categorical_cross_entropy(y, linear_output,
                                             extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
class SoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.

    """
    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emit(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=pvals_flat)
        return generated.reshape(probs.shape).argmax(axis=-1)

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def costs(self, readouts):
        return -self.softmax.log_probabilities(
            readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        return super(SoftmaxEmitter, self).get_dim(name)
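emit above draws a one-hot sample with theano_rng.multinomial and recovers the class index with argmax. The same trick in plain NumPy, with made-up probabilities:

import numpy

rng = numpy.random.RandomState(0)
pvals = numpy.array([[0.1, 0.7, 0.2],
                     [0.5, 0.25, 0.25]])     # (batch, vocab) categorical probs
one_hot = numpy.array([rng.multinomial(1, p) for p in pvals])
sampled_ids = one_hot.argmax(axis=-1)        # argmax of a one-hot draw = sampled index
print(one_hot)
print(sampled_ids)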
Example No. 11
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    #
    W = LookupTable(
        name="W1",
        #dim = hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(hidden_dim,
                 name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    #
    S = Linear(name="W2",
               input_dim=hidden_dim,
               output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))

    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return cg, layers, y_hat, cost
def rating_cost(pred_score,
                true_ratings,
                input_masks,
                output_masks,
                D,
                d,
                std=1.0,
                alpha=0.01):

    pred_score_cum = T.extra_ops.cumsum(pred_score, axis=2)
    prob_item_ratings = NDimensionalSoftmax(name='rating_cost_sf').apply(
        pred_score_cum, extra_ndim=1)
    accu_prob_1N = T.extra_ops.cumsum(prob_item_ratings, axis=2)
    accu_prob_N1 = T.extra_ops.cumsum(prob_item_ratings[:, :, ::-1],
                                      axis=2)[:, :, ::-1]
    mask1N = T.extra_ops.cumsum(true_ratings[:, :, ::-1], axis=2)[:, :, ::-1]
    maskN1 = T.extra_ops.cumsum(true_ratings, axis=2)
    cost_ordinal_1N = -T.sum(
        (T.log(prob_item_ratings) - T.log(accu_prob_1N)) * mask1N, axis=2)
    cost_ordinal_N1 = -T.sum(
        (T.log(prob_item_ratings) - T.log(accu_prob_N1)) * maskN1, axis=2)
    cost_ordinal = cost_ordinal_1N + cost_ordinal_N1
    nll_item_ratings = -(true_ratings * T.log(prob_item_ratings)).sum(axis=2)
    nll = std * nll_item_ratings.sum(
        axis=1) * 1.0 * D / (D - d + 1e-6) + alpha * cost_ordinal.sum(
            axis=1) * 1.0 * D / (D - d + 1e-6)
    cost = T.mean(nll)
    return cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1, prob_item_ratings
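rating_cost softmaxes the cumulative scores to get a distribution over ordinal rating levels, then builds the two cumulative probabilities used by the ordinal terms. A NumPy sketch of those intermediate quantities (one user, one item, five rating levels, made-up scores):

import numpy

def softmax(x):
    e = numpy.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

pred_score = numpy.array([[[0.1, 0.4, 0.4, 0.1, 0.0]]])   # (batch, items, ratings)
prob_item_ratings = softmax(numpy.cumsum(pred_score, axis=2))
accu_prob_1N = numpy.cumsum(prob_item_ratings, axis=2)
accu_prob_N1 = numpy.cumsum(prob_item_ratings[:, :, ::-1], axis=2)[:, :, ::-1]
print(prob_item_ratings.round(3))
print(accu_prob_1N.round(3))
print(accu_prob_N1.round(3))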
Example No. 13
 def __init__(self,
              lm_costs_name,
              lm_weight,
              normalize_am_weights=False,
              normalize_lm_weights=False,
              normalize_tot_weights=True,
              am_beta=1.0,
              **kwargs):
     super(ShallowFusionReadout, self).__init__(**kwargs)
     self.lm_costs_name = lm_costs_name
     self.lm_weight = lm_weight
     self.normalize_am_weights = normalize_am_weights
     self.normalize_lm_weights = normalize_lm_weights
     self.normalize_tot_weights = normalize_tot_weights
     self.am_beta = am_beta
     self.softmax = NDimensionalSoftmax()
     self.children += [self.softmax]
Example No. 14
    def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                              weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
        # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
Example No. 15
def softmax_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hidden_to_output = Linear(name='hidden_to_output' + str(pred),
                                  input_dim=hidden_size * len(h),
                                  output_dim=out_size)
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
    else:
        hidden_to_output = Linear(name='hidden_to_output' + str(pred),
                                  input_dim=hidden_size,
                                  output_dim=out_size)
        hiddens = h[-1]
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(hiddens)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    extra_ndim = 1 if single_dim_out else 2
    y_hat = softmax.apply(linear_output, extra_ndim=extra_ndim)
    cost = softmax.categorical_cross_entropy(y,
                                             linear_output,
                                             extra_ndim=extra_ndim).mean()

    return y_hat, cost
    def softmax_layer(self, h, y):
        """
        Perform Softmax over the hidden state in order to
        predict the next word in the sequence and compute
        the loss.
        :param h The hidden state sequence
        :param y The target words
        """
        hidden_to_output = Linear(name='hidden_to_output', input_dim=self.hidden_size,
                                  output_dim=self.vocab_size)
        initialize(hidden_to_output, sqrt(6.0 / (self.hidden_size + self.vocab_size)))

        linear_output = hidden_to_output.apply(h)
        linear_output.name = 'linear_output'
        softmax = NDimensionalSoftmax(name="lm_softmax")
        y_hat = softmax.log_probabilities(linear_output, extra_ndim=1)
        y_hat.name = 'y_hat'

        cost = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1).mean()

        cost.name = 'cost'
        return y_hat, cost
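This variant returns log_probabilities rather than apply, i.e. a log-softmax, which is often numerically preferable when the caller only needs log-likelihoods. A short sketch, assuming a standard Blocks/Theano install:

import numpy
import theano
from theano import tensor
from blocks.bricks import NDimensionalSoftmax

scores = tensor.tensor3('scores')              # (time, batch, vocab)
softmax = NDimensionalSoftmax(name="lm_softmax")
log_p = softmax.log_probabilities(scores, extra_ndim=1)
f = theano.function([scores], log_p)
out = f(numpy.zeros((2, 3, 4), dtype=theano.config.floatX))
print(numpy.exp(out).sum(axis=-1))             # each distribution sums to 1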
Example No. 17
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [
            self.mlp, self.mu, self.sigma, self.coeff, self.coeff2
        ]
Example No. 18
 def __init__(self, lm_costs_name, lm_weight,
              normalize_am_weights=False,
              normalize_lm_weights=False,
              normalize_tot_weights=True,
              am_beta=1.0,
              **kwargs):
     super(ShallowFusionReadout, self).__init__(**kwargs)
     self.lm_costs_name = lm_costs_name
     self.lm_weight = lm_weight
     self.normalize_am_weights = normalize_am_weights
     self.normalize_lm_weights = normalize_lm_weights
     self.normalize_tot_weights = normalize_tot_weights
     self.am_beta = am_beta
     self.softmax = NDimensionalSoftmax()
     self.children += [self.softmax]
Example No. 19
class GMMMLP(Initializable):
    """An mlp brick that branchs out to output
    sigma and mu for GMM
    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around.
    dim:
        output dim
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")


        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [self.mlp, self.mu, 
                         self.sigma, self.coeff, self.coeff2]
        #self.children.extend(self.mlp.children)

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state)
        coeff = self.coeff2.apply(self.coeff.apply(state),
            extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    @property
    def output_dim(self):
        return self.dim
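In apply above, the mixture weights come from a softmax over the coeff MLP's output, with a small constant added so no component is exactly zero (the weights then sum to 1 + k * const rather than 1). A NumPy sketch with made-up energies:

import numpy

def softmax(x):
    e = numpy.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

const = 1e-5
coeff_energies = numpy.array([[0.2, 1.3, -0.4]])   # (batch, k) from the coeff MLP
coeff = softmax(coeff_energies) + const            # floored mixture weights
print(coeff, coeff.sum(axis=-1))                   # sums to 1 + k * const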
Example No. 20
class GMMMLP(Initializable):
    """An mlp brick that branchs out to output
    sigma and mu for GMM
    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around.
    dim:
        output dim
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [
            self.mlp, self.mu, self.sigma, self.coeff, self.coeff2
        ]
        #self.children.extend(self.mlp.children)

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state)
        coeff = self.coeff2.apply(self.coeff.apply(state),
                                  extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    @property
    def output_dim(self):
        return self.dim
Example No. 21
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [self.mlp, self.mu,
                         self.sigma, self.coeff, self.coeff2]
Example No. 22
    def __init__(self, visual_dim, textual_dim, output_dim, hidden_size,
                 init_ranges, **kwargs):
        (visual_range, textual_range, linear_range_1, linear_range_2,
         linear_range_3) = init_ranges
        manager_dim = visual_dim + textual_dim
        visual_mlp = MLPGenreClassifier(
            visual_dim,
            output_dim,
            hidden_size, [linear_range_1, linear_range_2, linear_range_3],
            name='visual_mlp')
        textual_mlp = MLPGenreClassifier(
            textual_dim,
            output_dim,
            hidden_size, [linear_range_1, linear_range_2, linear_range_3],
            name='textual_mlp')
        # manager_mlp = MLPGenreClassifier(manager_dim, 2, hidden_size, [
        # linear_range_1, linear_range_2, linear_range_3], output_act=Softmax,
        # name='manager_mlp')
        bn = BatchNormalization(input_dim=manager_dim, name='bn3')
        manager_mlp = Sequence([
            Linear(manager_dim,
                   2,
                   name='linear_output',
                   use_bias=False,
                   weights_init=initialization.Uniform(
                       width=linear_range_1)).apply,
        ],
                               name='manager_mlp')
        fork = Fork(
            input_dim=manager_dim,
            output_dims=[2] * output_dim,
            prototype=manager_mlp,
            output_names=['linear_' + str(i) for i in range(output_dim)])

        children = [visual_mlp, textual_mlp, fork, bn, NDimensionalSoftmax()]
        kwargs.setdefault('use_bias', False)
        kwargs.setdefault('children', children)
        super(MoEClassifier, self).__init__(**kwargs)
    def __init__(self, config_dict, init_type="xavier", **kwargs):

        super(CharRNNModel, self).__init__(**kwargs)

        self.batch_size = config_dict["batch_size"]
        self.num_subwords = config_dict["num_subwords"]
        self.num_words = config_dict["num_words"]
        self.subword_embedding_size = config_dict["subword_embedding_size"]
        self.input_vocab_size = config_dict["input_vocab_size"]
        self.output_vocab_size = config_dict["output_vocab_size"]
        self.subword_RNN_hidden_state_size = config_dict["subword_RNN_hidden_state_size"]
        self.table_width = config_dict["table_width"]
        self.max_out_dim = config_dict["max_out_dim"]
        self.max_out_K = config_dict["max_out_K"]

        self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name="input_lookup")
        self.lookup.weights_init = Uniform(width=self.table_width)
        self.lookup.biases_init = Constant(0)

        if init_type == "xavier":
            linear_init = XavierInitializationOriginal(self.subword_embedding_size, self.subword_RNN_hidden_state_size)
            lstm_init = XavierInitializationOriginal(self.subword_embedding_size, self.subword_RNN_hidden_state_size)
        else:  # default is gaussian
            linear_init = IsotropicGaussian()
            lstm_init = IsotropicGaussian()

        # The `inputs` are then split in this order: Input gates, forget gates, cells and output gates
        self.linear_forward = Linear(
            input_dim=self.subword_embedding_size,
            output_dim=self.subword_RNN_hidden_state_size * 4,
            name="linear_forward",
            weights_init=linear_init,
            biases_init=Constant(0.0),
        )

        self.language_model = LSTM(
            dim=self.subword_RNN_hidden_state_size,
            activation=Tanh(),
            name="language_model_RNN",
            weights_init=lstm_init,
            biases_init=Constant(0.0),
        )

        self.max_out = LinearMaxout(
            self.subword_RNN_hidden_state_size,
            self.max_out_dim,
            self.max_out_K,
            name="max_out",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax_linear = Linear(
            self.max_out_dim,
            self.output_vocab_size,
            name="soft_max_linear",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax = NDimensionalSoftmax()

        self.children = [
            self.lookup,
            self.linear_forward,
            self.language_model,
            self.max_out,
            self.softmax_linear,
            self.softmax,
        ]
Example No. 24
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (also used by the definition model
        when it is standalone).
    dim : int
        The dimension of the RNN states (also used by the definition model
        when it is standalone).
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader: either 'LSTM' or 'mean'
    standalone_def_rnn : bool
        If `True`, a standalone RNN with separate word embeddings is used
        to embed definition. If `False` the language model is reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged.
        If 'fully_connected_linear', a learned perceptron composes the two
        embeddings linearly.
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...

    """
    def __init__(self,
                 emb_dim,
                 emb_def_dim,
                 dim,
                 num_input_words,
                 def_num_input_words,
                 num_output_words,
                 vocab,
                 retrieval=None,
                 def_reader='LSTM',
                 standalone_def_lookup=True,
                 standalone_def_rnn=True,
                 disregard_word_embeddings=False,
                 compose_type='sum',
                 very_rare_threshold=[10],
                 cache_size=0,
                 **kwargs):
        # TODO(tombosc): document
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words !=
                def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            #TODO(tombosc) do we implement cache as LookupTable or theano matrix?
            #self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size,
                                      emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words,
                                        emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])
        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim,
                                                       dim,
                                                       vocab,
                                                       lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)

            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words,
                    emb_def_dim,
                    dim,
                    vocab,
                    lookup,
                    translate=(emb_def_dim != dim),
                    normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps and B is the batch size. Note that this axis order differs
            from what all RNN bricks consume, so the axes have to be
            transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)

            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(def_embeddings.shape[0],
                                                    name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(unk_ratio(
            input_word_ids, mask, self._vocab.unk),
                                                name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask),
                                                name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(self._cache.W[flat_word_ids_to_update],
                                      updated))
            ]

        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask),
                                                name='main_rnn_in_RMS')

        main_rnn_states = self._main_rnn.apply(tensor.transpose(
            self._main_fork.apply(rnn_inputs), (1, 0, 2)),
                                               mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(targets,
                                                             logits,
                                                             extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def, "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(mask_targets_has_def.T,
                                                    name='mask_def_emb')

        return costs, updates
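The "shortlisting" step in apply maps every word id at or beyond the input vocabulary budget to UNK before the lookup. A self-contained Theano sketch of that expression (the shortlist size and UNK id below are hypothetical):

import numpy
import theano
from theano import tensor

word_ids = tensor.lmatrix('word_ids')
num_input_words = 5                    # hypothetical shortlist size
unk = 0                                # hypothetical UNK id
shortlisted = (tensor.lt(word_ids, num_input_words) * word_ids +
               tensor.ge(word_ids, num_input_words) * unk)
f = theano.function([word_ids], shortlisted)
print(f(numpy.array([[1, 7, 3], [9, 2, 4]], dtype='int64')))
# ids 7 and 9 exceed the shortlist and come out as UNK (0)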
Example No. 25
    def __init__(self,
                 emb_dim,
                 emb_def_dim,
                 dim,
                 num_input_words,
                 def_num_input_words,
                 num_output_words,
                 vocab,
                 retrieval=None,
                 def_reader='LSTM',
                 standalone_def_lookup=True,
                 standalone_def_rnn=True,
                 disregard_word_embeddings=False,
                 compose_type='sum',
                 very_rare_threshold=[10],
                 cache_size=0,
                 **kwargs):
        # TODO(tombosc): document
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words !=
                def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            #TODO(tombosc) do we implement cache as LookupTable or theano matrix?
            #self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size,
                                      emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words,
                                        emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])
        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim,
                                                       dim,
                                                       vocab,
                                                       lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)

            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words,
                    emb_def_dim,
                    dim,
                    vocab,
                    lookup,
                    translate=(emb_def_dim != dim),
                    normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)
Example No. 26
class MinRiskInitialContextSequenceGenerator(InitialContextSequenceGenerator):
    def __init__(self, *args, **kwargs):
        self.softmax = NDimensionalSoftmax()
        super(MinRiskInitialContextSequenceGenerator,
              self).__init__(*args, **kwargs)
        self.children.append(self.softmax)

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    # TODO: check where 'target_samples_mask' is used -- do we need a mask for context features (probably not)
    # Note: the @application decorator inspects the arguments, and transparently adds args  ('application_call')
    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_samples_mask',
        'target_samples', 'scores'
    ],
                 outputs=['cost'])
    def expected_cost(self,
                      application_call,
                      representation,
                      source_sentence_mask,
                      target_samples,
                      target_samples_mask,
                      scores,
                      smoothing_constant=0.005,
                      **kwargs):
        """
        Emulate the process in sequence_generator.cost_matrix, but compute log
        probabilities instead of costs. For each sample, we need its probability
        according to the model (these could actually be passed from the sampling
        model, which could be more efficient).
        """

        # Transpose everything (note we can use transpose here only if it's 2d, otherwise we need dimshuffle)
        source_sentence_mask = source_sentence_mask.T

        # make samples (time, batch)
        samples = target_samples.T
        samples_mask = target_samples_mask.T

        # we need this to set the 'attended' kwarg
        keywords = {
            'mask': target_samples_mask,
            'outputs': target_samples,
            'attended': representation,
            'attended_mask': source_sentence_mask
        }

        batch_size = samples.shape[1]

        # Prepare input for the iterative part
        states = dict_subset(keywords, self._state_names, must_have=False)
        # masks in context are optional (e.g. `attended_mask`)
        # contexts = dict_subset(keywords, self._context_names, must_have=False)

        # add the initial state context features
        contexts = dict_subset(keywords, self._context_names, must_have=False)
        contexts['initial_state_context'] = kwargs['initial_state_context']

        feedback = self.readout.feedback(samples)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        results = self.transition.apply(mask=samples_mask,
                                        return_initial_states=True,
                                        as_dict=True,
                                        **dict_union(inputs, states, contexts))

        # Separate the deliverables. The last states are discarded: they
        # are not used to predict any output symbol. The initial glimpses
        # are discarded because they are not used for prediction.
        # Remember, glimpses are computed _before_ output stage, states are
        # computed after.
        states = {name: results[name][:-1] for name in self._state_names}
        glimpses = {name: results[name][1:] for name in self._glimpse_names}

        # Compute the cost
        feedback = tensor.roll(feedback, 1, 0)
        feedback = tensor.set_subtensor(
            feedback[0],
            self.readout.feedback(self.readout.initial_outputs(batch_size)))
        readouts = self.readout.readout(feedback=feedback,
                                        **dict_union(states, glimpses,
                                                     contexts))

        word_probs = self.probs(readouts)
        word_probs = tensor.log(word_probs)

        # Note: converting the samples to one-hot wastes space, but it gets the job done
        # TODO: this may be the op that sometimes causes out-of-memory
        one_hot_samples = tensor.eye(word_probs.shape[-1])[samples]
        one_hot_samples = one_hot_samples.astype('float32')
        actual_probs = word_probs * one_hot_samples

        # reshape to (batch, time, prob), then sum over the batch dimension
        # to get sequence-level probability
        actual_probs = actual_probs.dimshuffle(1, 0, 2)
        # we are first summing over vocabulary (only one non-zero cell per row)
        sequence_probs = actual_probs.sum(axis=2)
        sequence_probs = sequence_probs * target_samples_mask
        # now sum over time dimension
        sequence_probs = sequence_probs.sum(axis=1)

        # reshape and do exp() to get the true probs back
        # sequence_probs = tensor.exp(sequence_probs.reshape(scores.shape))
        sequence_probs = sequence_probs.reshape(scores.shape)

        # Note that the smoothing constant can be set by user
        sequence_distributions = (
            tensor.exp(sequence_probs * smoothing_constant) /
            tensor.exp(sequence_probs * smoothing_constant).sum(axis=1,
                                                                keepdims=True))

        # the following lines are done explicitly for code clarity
        # -- first get sequence expectation, then sum up the expectations for every
        # seq in the minibatch
        expected_scores = (sequence_distributions * scores).sum(axis=1)
        expected_scores = expected_scores.sum(axis=0)

        return expected_scores
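The tail of expected_cost smooths the sequence log-probabilities, renormalises them over the samples of each source sentence, and uses the result to weight the scores. A NumPy sketch of that expectation with made-up numbers:

import numpy

sequence_log_probs = numpy.array([[-10., -12., -9.]])   # (batch, n_samples)
scores = numpy.array([[0.3, 0.8, 0.1]])                 # e.g. 1 - sentence BLEU
smoothing_constant = 0.005
w = numpy.exp(sequence_log_probs * smoothing_constant)
w = w / w.sum(axis=1, keepdims=True)
expected_score = (w * scores).sum(axis=1).sum(axis=0)
print(expected_score)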
Example No. 27
    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]
Example No. 28
    def __init__(self,
                 emb_dim,
                 dim,
                 num_input_words,
                 num_output_words,
                 vocab,
                 proximity_coef=0,
                 proximity_distance='l2',
                 encoder='lstm',
                 decoder='lstm',
                 shared_rnn=False,
                 translate_layer=None,
                 word_dropout=0.,
                 tied_in_out=False,
                 vocab_keys=None,
                 seed=0,
                 reconstruction_coef=1.,
                 provide_targets=False,
                 **kwargs):
        """
        translate_layer: either a string naming the activation function to use,
                         or a list containing the activations for an MLP
        """
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._word_dropout = word_dropout

        self._tied_in_out = tied_in_out

        if not encoder:
            if proximity_coef:
                raise ValueError("Err: meaningless penalty term (no encoder)")
            if not vocab_keys:
                raise ValueError("Err: specify a key vocabulary (no encoder)")

        if tied_in_out and num_input_words != num_output_words:
            raise ValueError("Can't tie in and out embeddings. Different "
                             "vocabulary size")
        if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'):
            raise ValueError(
                "can't share RNN because either encoder or decoder "
                "is not an RNN")
        if shared_rnn and decoder == 'lstm_c':
            raise ValueError(
                "can't share RNN because the decoder takes different "
                "inputs")
        if word_dropout < 0 or word_dropout > 1:
            raise ValueError("invalid value for word dropout",
                             str(word_dropout))
        if proximity_distance not in ['l1', 'l2', 'cos']:
            raise ValueError(
                "unrecognized distance: {}".format(proximity_distance))

        if proximity_coef and emb_dim != dim and not translate_layer:
            raise ValueError(
                "if proximity penalisation, emb_dim should equal dim or "
                "there should be a translate layer")

        if encoder not in [
                None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm',
                'bilstm_sum', 'max_bilstm_sum'
        ]:
            raise ValueError('encoder not recognized')
        if decoder not in ['skip-gram', 'lstm', 'lstm_c']:
            raise ValueError('decoder not recognized')

        self._proximity_distance = proximity_distance
        self._decoder = decoder
        self._encoder = encoder
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._proximity_coef = proximity_coef
        self._reconstruction_coef = reconstruction_coef
        self._provide_targets = provide_targets

        self._word_to_id = WordToIdOp(self._vocab)
        if vocab_keys:
            self._key_to_id = WordToIdOp(vocab_keys)

        children = []

        if encoder or (not encoder and decoder in ['lstm', 'lstm_c']):
            self._main_lookup = LookupTable(self._num_input_words,
                                            emb_dim,
                                            name='main_lookup')
            children.append(self._main_lookup)
        if provide_targets:
            # this is useful to simulate Hill's baseline: the pretrained embeddings
            # are not used as inputs to the encoder, only as targets for it.
            self._target_lookup = LookupTable(self._num_input_words,
                                              emb_dim,
                                              name='target_lookup')
            children.append(self._target_lookup)
        if not encoder:
            self._key_lookup = LookupTable(vocab_keys.size(),
                                           emb_dim,
                                           name='key_lookup')
            children.append(self._key_lookup)
        elif encoder == 'lstm':
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = LSTM(dim, name='encoder_rnn')
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm', 'max_bilstm']:
            # dim is the dim of the concatenated vector
            self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork')
            self._encoder_rnn = Bidirectional(LSTM(dim // 2,
                                                   name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm_sum', 'max_bilstm_sum']:
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder == 'mean':
            pass
        elif encoder == 'weighted_mean':
            self._encoder_w = MLP([Logistic()], [dim, 1],
                                  name="encoder_weights")
            children.extend([self._encoder_w])
        else:
            raise NotImplementedError()

        if decoder in ['lstm', 'lstm_c']:
            dim_after_translate = emb_dim
            if shared_rnn:
                self._decoder_fork = self._encoder_fork
                self._decoder_rnn = self._encoder_rnn
            else:
                if decoder == 'lstm_c':
                    dim_2 = dim + emb_dim
                else:
                    dim_2 = dim
                self._decoder_fork = Linear(dim_2,
                                            4 * dim,
                                            name='decoder_fork')
                self._decoder_rnn = LSTM(dim, name='decoder_rnn')
            children.extend([self._decoder_fork, self._decoder_rnn])
        elif decoder == 'skip-gram':
            dim_after_translate = emb_dim

        self._translate_layer = None
        activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None}

        if translate_layer:
            if isinstance(translate_layer, str):
                translate_layer = [translate_layer]
            assert isinstance(translate_layer, list)
            activations_translate = [activations[a] for a in translate_layer]
            dims_translate = [
                dim,
            ] * len(translate_layer) + [dim_after_translate]
            self._translate_layer = MLP(activations_translate,
                                        dims_translate,
                                        name="translate_layer")
            children.append(self._translate_layer)

        if not self._tied_in_out:
            self._pre_softmax = Linear(emb_dim, self._num_output_words)
            children.append(self._pre_softmax)
        if decoder in ['lstm', 'lstm_c']:
            self._softmax = NDimensionalSoftmax()
        elif decoder in ['skip-gram']:
            self._softmax = Softmax()
        children.append(self._softmax)

        super(Seq2Seq, self).__init__(children=children, **kwargs)
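For illustration, a minimal sketch (using the hypothetical sizes dim=256 and emb_dim=300, with a skip-gram decoder so that dim_after_translate equals emb_dim) of what the translate_layer specification above expands to: a bare string is wrapped in a one-element list, each entry is mapped to an activation brick, and the MLP dimensions repeat dim once per activation before ending in dim_after_translate.

from blocks.bricks import MLP, Tanh

dim, emb_dim = 256, 300  # hypothetical sizes
# translate_layer='tanh' would expand to MLP([Tanh()], [256, 300]);
# translate_layer=['tanh', 'linear'] expands to:
translate_layer_mlp = MLP([Tanh(), None], [dim, dim, emb_dim],
                          name="translate_layer")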
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
        If True, the definition reader reuses the main lookup table.
    compose_type : str
        How the combiner composes word and definition embeddings.

    """
    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim,
                                  name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations,
                                  readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the definitions,
            # we can be screwed.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self,
                application_call,
                text,
                mask,
                def_embs=None,
                def_map=None,
                text_name=None):
        if not self._random_unk:
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self,
              application_call,
              contexts,
              contexts_mask,
              questions,
              questions_mask,
              answer_begins,
              answer_ends,
              defs=None,
              def_mask=None,
              contexts_def_map=None,
              questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                              mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (1 -
                                                                contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
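The attention and readout code above repeatedly uses the same masking trick: multiply the scores by the mask and subtract a large constant (1000) where the mask is zero, so that padded positions receive negligible probability after the softmax. A plain-NumPy illustration with toy numbers:

import numpy

def masked_softmax(scores, mask):
    # push masked positions towards a very negative value before normalizing
    scores = scores * mask - 1000.0 * (1 - mask)
    e = numpy.exp(scores - scores.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

scores = numpy.array([2.0, 1.0, 3.0, 0.5])
mask = numpy.array([1.0, 1.0, 0.0, 1.0])    # third position is padding
print(masked_softmax(scores, mask))         # ~0 probability at the padded slot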
Exemplo n.º 30
0
    def __init__(self, input_sources_list, input_sources_vocab_size_list,
                 output_source, output_source_vocab_size,
                 lookup_dim=200, hidden_size=256, recurrent_stack_size=1):

        self.InputSources = input_sources_list
        self.InputSourcesVocab = input_sources_vocab_size_list
        self.OutputSource = output_source
        self.OutputSourceVocab = output_source_vocab_size

        inputs = [tensor.lmatrix(source) for source in input_sources_list]
        output = tensor.lmatrix(output_source)

        lookups = self.get_lookups(lookup_dim, input_sources_vocab_size_list)

        for lookup in lookups:
            lookup.initialize()

        merge = Merge([lookup.name for lookup in lookups], [lookup.dim for lookup in lookups], hidden_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        merge.initialize()

        linear0 = Linear(input_dim=hidden_size, output_dim=hidden_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0), name='linear0')
        linear0.initialize()

        recurrent_blocks = []

        for i in range(recurrent_stack_size):
            recurrent_blocks.append(SimpleRecurrent(
                dim=hidden_size, activation=Tanh(),
                weights_init=initialization.Uniform(width=0.01),
                use_bias=False))

        for i, recurrent_block in enumerate(recurrent_blocks):
            recurrent_block.name = 'recurrent'+str(i+1)
            recurrent_block.initialize()

        linear_out = Linear(input_dim=hidden_size, output_dim=output_source_vocab_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0), name='linear_out')
        linear_out.initialize()
        softmax = NDimensionalSoftmax(name='softmax')

        lookup_outputs = [lookup.apply(input) for lookup, input in zip(lookups, inputs)]

        m = merge.apply(*lookup_outputs)
        r = linear0.apply(m)
        for block in recurrent_blocks:
            r = block.apply(r)
        a = linear_out.apply(r)

        self.Cost = softmax.categorical_cross_entropy(output, a, extra_ndim=1).mean()
        self.Cost.name = 'cost'

        y_hat = softmax.apply(a, extra_ndim=1)
        y_hat.name = 'y_hat'

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Function = None
        self.MainLoop = None
        self.Model = Model(y_hat)
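The extra_ndim=1 calls above rely on NDimensionalSoftmax folding the leading (time) axis into the batch, applying a row-wise softmax, and unfolding the result. A minimal sketch of that behaviour, assuming Theano and Blocks are installed:

import numpy
import theano
from theano import tensor
from blocks.bricks import NDimensionalSoftmax

logits = tensor.tensor3('logits')      # (time, batch, vocab)
targets = tensor.lmatrix('targets')    # (time, batch)
softmax = NDimensionalSoftmax()
probs = softmax.apply(logits, extra_ndim=1)
costs = softmax.categorical_cross_entropy(targets, logits, extra_ndim=1)

f = theano.function([logits, targets], [probs, costs])
p, c = f(numpy.random.randn(5, 2, 7).astype(theano.config.floatX),
         numpy.random.randint(0, 7, size=(5, 2)))
# p has shape (5, 2, 7) and each p[t, b] sums to 1; c has shape (5, 2)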
Exemplo n.º 31
0
rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean()


from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam, RMSProp, StepClipping

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]
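The step rules defined above are typically combined into a single rule and handed to GradientDescent; a minimal sketch, assuming a Blocks version whose GradientDescent takes cost, parameters and step_rule keywords:

from blocks.algorithms import CompositeRule

algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))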
Exemplo n.º 32
0
    parser.add_argument('-temperature', type=float,
                        default=1.0, help='temperature of sampling')
    args = parser.parse_args()

    # Define primetext
    ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file)
    if args.primetext and len(args.primetext) > 0:
        primetext = ''.join(
            [ch for ch in args.primetext if ch in char_to_ix.keys()])
        x_curr = numpy.expand_dims(
            numpy.array([char_to_ix[ch] for ch in primetext], dtype='uint8'), axis=1)
    else:
        dev_stream = get_stream(hdf5_file, 'dev', batch_size)
        x_curr, y_curr = next(dev_stream.get_epoch_iterator())
        x_curr = x_curr[:, -1].reshape(seq_length, 1)

    print('Loading model from {0}...'.format(args.model))
    main_loop = load(args.model)
    print('Model loaded. Building prediction function...')
    model = main_loop.model
    y, x = model.inputs
    softmax = NDimensionalSoftmax()
    linear_output = [
        v for v in model.variables if v.name == 'linear_output'][0]
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    predict = theano.function([x], y_hat)

    print('Starting sampling')
    sample_string = sample(args.length, x_curr, predict, ix_to_char,
                           seed=args.seed, temperature=args.temperature)
class CharRNNModel(Initializable):
    """
    A model for testing that the components of my more complex models work.

    This is just a model that predicts one character at a time using an LSTM layer.
    """

    def __init__(self, config_dict, init_type="xavier", **kwargs):

        super(CharRNNModel, self).__init__(**kwargs)

        self.batch_size = config_dict["batch_size"]
        self.num_subwords = config_dict["num_subwords"]
        self.num_words = config_dict["num_words"]
        self.subword_embedding_size = config_dict["subword_embedding_size"]
        self.input_vocab_size = config_dict["input_vocab_size"]
        self.output_vocab_size = config_dict["output_vocab_size"]
        self.subword_RNN_hidden_state_size = config_dict["subword_RNN_hidden_state_size"]
        self.table_width = config_dict["table_width"]
        self.max_out_dim = config_dict["max_out_dim"]
        self.max_out_K = config_dict["max_out_K"]

        self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name="input_lookup")
        self.lookup.weights_init = Uniform(width=self.table_width)
        self.lookup.biases_init = Constant(0)

        if init_type == "xavier":
            linear_init = XavierInitializationOriginal(self.subword_embedding_size, self.subword_RNN_hidden_state_size)
            lstm_init = XavierInitializationOriginal(self.subword_embedding_size, self.subword_RNN_hidden_state_size)
        else:  # default is gaussian
            linear_init = IsotropicGaussian()
            lstm_init = IsotropicGaussian()

        # The `inputs` are then split in this order: Input gates, forget gates, cells and output gates
        self.linear_forward = Linear(
            input_dim=self.subword_embedding_size,
            output_dim=self.subword_RNN_hidden_state_size * 4,
            name="linear_forward",
            weights_init=linear_init,
            biases_init=Constant(0.0),
        )

        self.language_model = LSTM(
            dim=self.subword_RNN_hidden_state_size,
            activation=Tanh(),
            name="language_model_RNN",
            weights_init=lstm_init,
            biases_init=Constant(0.0),
        )

        self.max_out = LinearMaxout(
            self.subword_RNN_hidden_state_size,
            self.max_out_dim,
            self.max_out_K,
            name="max_out",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax_linear = Linear(
            self.max_out_dim,
            self.output_vocab_size,
            name="soft_max_linear",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax = NDimensionalSoftmax()

        self.children = [
            self.lookup,
            self.linear_forward,
            self.language_model,
            self.max_out,
            self.softmax_linear,
            self.softmax,
        ]

    @application(inputs=["features", "features_mask", "targets", "targets_mask"], outputs=["cost"])
    def apply(self, features, features_mask, targets, targets_mask):

        subword_embeddings = self.lookup.apply(features)
        sentence_embeddings = self.language_model.apply(
            self.linear_forward.apply(subword_embeddings), mask=features_mask
        )[
            0
        ]  # [0] = hidden states, [1] = cells

        linear_output = self.softmax_linear.apply(self.max_out.apply(sentence_embeddings))
        # per-position cross-entropy, masked before averaging so that padding
        # does not contribute to the cost
        costs = self.softmax.categorical_cross_entropy(targets, linear_output, extra_ndim=1)
        cost = (costs * targets_mask).sum() / targets_mask.sum()
        cost.name = "cost"
        return cost
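A toy NumPy check (hypothetical numbers) of why the mask has to be applied to the per-position costs before averaging rather than after taking the mean:

import numpy

costs = numpy.array([[1.0, 2.0], [3.0, 4.0]])    # per-position CE, (time, batch)
mask = numpy.array([[1.0, 1.0], [1.0, 0.0]])     # last step of column 1 is padding
masked_mean = (costs * mask).sum() / mask.sum()  # (1 + 2 + 3) / 3 = 2.0
unmasked_mean = costs.mean()                     # 2.5, polluted by the padded entry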
Exemplo n.º 34
0
 def __init__(self, initial_output=0, **kwargs):
     self.initial_output = initial_output
     self.softmax = NDimensionalSoftmax()
     children = [self.softmax]
     kwargs.setdefault('children', []).extend(children)
     super(SoftmaxEmitter, self).__init__(**kwargs)
 def __init__(self, initial_output=0, **kwargs):
     super(SoftmaxEmitter, self).__init__(**kwargs)
     self.initial_output = initial_output
     self.softmax = NDimensionalSoftmax()
     self.children = [self.softmax]
Exemplo n.º 36
0
    def __init__(
            self,
            dim,
            emb_dim,
            vocab,
            def_emb_translate_dim=-1,
            def_dim=-1,
            encoder='bilstm',
            bn=True,
            def_reader=None,
            def_combiner=None,
            dropout=0.5,
            num_input_words=-1,
            # Others
            **kwargs):

        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(input_dim=emb_dim,
                                             output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words,
                                   emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])

        ## BiLSTM no. 2 (encoded attentioned embeddings)
        self._hyp_bidir_fork2 = Linear(8 * dim,
                                       4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim,
                                        4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])

        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False,
                                           name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")

        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)
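Several constructors above pair a Linear fork with output_dim 4 * dim and an LSTM of dimension dim: Blocks' LSTM expects its pre-computed inputs to already have size 4 * dim, which it splits into input gate, forget gate, cell and output gate activations. A minimal sketch of the pairing, with hypothetical sizes:

from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM

emb_dim, dim = 300, 256    # hypothetical sizes
fork = Linear(input_dim=emb_dim, output_dim=4 * dim, name='fork')
rnn = LSTM(dim=dim, name='rnn')
# forked = fork.apply(embeddings)               # (time, batch, 4 * dim)
# states, cells = rnn.apply(forked, mask=mask)  # each (time, batch, dim)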
Exemplo n.º 37
0
linear_input.initialize()

rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam, RMSProp, StepClipping

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]
Exemplo n.º 38
0
 def __init__(self, initial_output=0, **kwargs):
     self.initial_output = initial_output
     self.softmax = NDimensionalSoftmax()
     children = [self.softmax] + kwargs.get('children', [])
     super(SoftmaxEmitter, self).__init__(children=children, **kwargs)
Exemplo n.º 40
0
    def fit(self, trainset, retrain=True):
        batch_size = self.batch_size
        n_iter = self.n_iter
        look_ahead = self.look_ahead
        lr = self.lr
        b1 = self.b1
        b2 = self.b2
        epsilon = self.epsilon
        hidden_size = self.hidden_size
        activation_function = self.activation_function
        drop_rate = self.drop_rate
        weight_decay = self.weight_decay
        optimizer = self.optimizer
        std = self.std
        alpha = self.alpha
        polyak_mu = self.polyak_mu
        rating_category = self.rating_category
        item_num = self.item_num
        user_num = self.user_num
        trainset = self.load_dataset(which_set=['train'],
                                     sources=('input_ratings',
                                              'output_ratings', 'input_masks',
                                              'output_masks'))
        validset = self.load_dataset(which_set=['valid'],
                                     sources=('input_ratings',
                                              'output_ratings', 'input_masks',
                                              'output_masks'))

        train_loop_stream = ForceFloatX(data_stream=MovieLensTransformer(
            data_stream=Trainer_MovieLensTransformer(data_stream=DataStream(
                dataset=trainset,
                iteration_scheme=ShuffledScheme(trainset.num_examples,
                                                batch_size)))))

        valid_monitor_stream = ForceFloatX(data_stream=MovieLensTransformer(
            data_stream=DataStream(dataset=validset,
                                   iteration_scheme=ShuffledScheme(
                                       validset.num_examples, batch_size))))

        rating_freq = np.zeros((user_num, rating_category))
        init_b = np.zeros((user_num, rating_category))
        for batch in valid_monitor_stream.get_epoch_iterator():
            inp_r, out_r, inp_m, out_m = batch
            rating_freq += inp_r.sum(axis=0)

        log_rating_freq = np.log(rating_freq + 1e-8)
        log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
        init_b[:, 1:] = log_rating_freq_diff
        init_b[:, 0] = log_rating_freq[:, 0]
        #     init_b = np.log(rating_freq / (rating_freq.sum(axis=1)[:, None] + 1e-8) +1e-8)  * (rating_freq>0)

        new_items = np.where(rating_freq.sum(axis=1) == 0)[0]
        self.new_items = new_items
        input_ratings = T.tensor3(name='input_ratings',
                                  dtype=theano.config.floatX)
        output_ratings = T.tensor3(name='output_ratings',
                                   dtype=theano.config.floatX)
        input_masks = T.matrix(name='input_masks', dtype=theano.config.floatX)
        output_masks = T.matrix(name='output_masks',
                                dtype=theano.config.floatX)

        input_ratings_cum = T.extra_ops.cumsum(input_ratings[:, :, ::-1],
                                               axis=2)[:, :, ::-1]

        #     hidden_size = [256]
        if activation_function == 'reclin':
            act = Rectifier
        elif activation_function == 'tanh':
            act = Tanh
        elif activation_function == 'sigmoid':
            act = Logistic
        else:
            act = Softplus
        layers_act = [act('layer_%d' % i) for i in range(len(hidden_size))]
        NADE_CF_model = tabula_NADE(activations=layers_act,
                                    input_dim0=user_num,
                                    input_dim1=rating_category,
                                    other_dims=hidden_size,
                                    batch_size=batch_size,
                                    weights_init=Uniform(std=0.05),
                                    biases_init=Constant(0.0))
        NADE_CF_model.push_initialization_config()
        dims = [user_num] + hidden_size + [user_num]
        linear_layers = [
            layer for layer in NADE_CF_model.children if 'linear' in layer.name
        ]
        assert len(linear_layers) == len(dims) - 1
        for i in range(len(linear_layers)):
            H1 = dims[i]
            H2 = dims[i + 1]
            width = 2 * np.sqrt(6) / np.sqrt(H1 + H2)
            #         std = np.sqrt(2. / dim)
            linear_layers[i].weights_init = Uniform(width=width)
        NADE_CF_model.initialize()
        NADE_CF_model.children[-1].parameters[-1].set_value(
            init_b.astype(theano.config.floatX))
        y = NADE_CF_model.apply(input_ratings_cum)
        y_cum = T.extra_ops.cumsum(y, axis=2)
        predicted_ratings = NDimensionalSoftmax().apply(y_cum, extra_ndim=1)
        d = input_masks.sum(axis=1)
        D = (input_masks + output_masks).sum(axis=1)
        cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1, prob_item_ratings = rating_cost(
            y,
            output_ratings,
            input_masks,
            output_masks,
            D,
            d,
            alpha=alpha,
            std=std)
        cost.name = 'cost'

        cg = ComputationGraph(cost)
        if weight_decay > 0.0:
            all_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            l2_weights = T.sum([(W**2).sum() for W in all_weights])
            l2_cost = cost + weight_decay * l2_weights
            l2_cost.name = 'l2_decay_' + cost.name
            cg = ComputationGraph(l2_cost)
        if drop_rate > 0.0:
            dropped_layer = VariableFilter(roles=[INPUT],
                                           bricks=NADE_CF_model.children)(
                                               cg.variables)
            dropped_layer = [
                layer for layer in dropped_layer if 'linear' in layer.name
            ]
            dropped_layer = dropped_layer[1:]
            cg_dropout = apply_dropout(cg, dropped_layer, drop_rate)
        else:
            cg_dropout = cg
        training_cost = cg_dropout.outputs[0]
        lr0 = T.scalar(name='learning_rate', dtype=theano.config.floatX)
        input_list = [input_ratings, input_masks, output_ratings, output_masks]
        if optimizer == 'Adam':
            f_get_grad, f_update_parameters, shared_gradients = Adam_optimizer(
                input_list, training_cost, cg_dropout.parameters, lr0, b1, b2,
                epsilon)
        elif optimizer == 'Adadelta':
            f_get_grad, f_update_parameters, shared_gradients = Adadelta_optimizer(
                input_list, training_cost, cg_dropout.parameters, lr, epsilon)
        else:
            f_get_grad, f_update_parameters, shared_gradients = SGD_optimizer(
                input_list, training_cost, cg_dropout.parameters, lr0, b1)

        param_list = []
        [param_list.extend(p.parameters) for p in NADE_CF_model.children]
        f_update_polyak, shared_polyak = polyak(param_list, mu=polyak_mu)

        f_monitor = theano.function(inputs=[input_ratings],
                                    outputs=[predicted_ratings])
        nb_of_epocs_without_improvement = 0
        best_valid_error = np.Inf
        epoch = 0
        best_model = cp.deepcopy(NADE_CF_model)
        best_polyak = cp.deepcopy(shared_polyak)
        start_training_time = t.time()
        lr_tracer = []
        rate_score = np.array(list(range(1, rating_category + 1)), np.float32)
        best_epoch = -1
        while epoch < n_iter and nb_of_epocs_without_improvement < look_ahead:
            print('Epoch {0}'.format(epoch))
            epoch += 1
            start_time_epoch = t.time()
            cost_train = []
            squared_error_train = []
            n_sample_train = []
            cntt = 0
            train_time = 0
            for batch in train_loop_stream.get_epoch_iterator():

                inp_r, out_r, inp_m, out_m = batch
                train_t = t.time()
                cost_value = f_get_grad(inp_r, inp_m, out_r, out_m)
                train_time += t.time() - train_t
                #             pred_ratings = f_monitor(inp_r)
                if optimizer == 'Adadelta':
                    f_update_parameters()
                else:
                    f_update_parameters(lr)
                f_update_polyak()
                pred_ratings = f_monitor(inp_r)
                true_r = out_r.argmax(axis=2) + 1
                pred_r = (pred_ratings[0] *
                          rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)
                pred_r[:, new_items] = 3
                mask = out_r.sum(axis=2)
                se = np.sum(np.square(true_r - pred_r) * mask)
                n = np.sum(mask)
                squared_error_train.append(se)
                n_sample_train.append(n)
                cost_train.append(cost_value)
                cntt += 1

            cost_train = np.array(cost_train).mean()
            squared_error_ = np.array(squared_error_train).sum()
            n_samples = np.array(n_sample_train).sum()
            train_RMSE = np.sqrt(squared_error_ / (n_samples * 1.0 + 1e-8))

            print('\tTraining   ...')
            print('Train     :', "RMSE: {0:.6f}".format(train_RMSE),
                  " Cost Error: {0:.6f}".format(cost_train),
                  "Train Time: {0:.6f}".format(train_time),
                  get_done_text(start_time_epoch))

            print('\tValidating ...', )
            start_time = t.time()
            squared_error_valid = []
            n_sample_valid = []
            valid_time = 0
            for batch in valid_monitor_stream.get_epoch_iterator():
                inp_r, out_r, inp_m, out_m = batch
                valid_t = t.time()
                pred_ratings = f_monitor(inp_r)
                valid_time += t.time() - valid_t
                true_r = out_r.argmax(axis=2) + 1
                pred_r = (pred_ratings[0] *
                          rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)

                pred_r[:, new_items] = 3
                mask = out_r.sum(axis=2)
                se = np.sum(np.square(true_r - pred_r) * mask)
                n = np.sum(mask)
                squared_error_valid.append(se)
                n_sample_valid.append(n)

            squared_error_ = np.array(squared_error_valid).sum()
            n_samples = np.array(n_sample_valid).sum()
            valid_RMSE = np.sqrt(squared_error_ / (n_samples * 1.0 + 1e-8))
            print('Validation:', " RMSE: {0:.6f}".format(valid_RMSE),
                  "Valid Time: {0:.6f}".format(valid_time),
                  get_done_text(start_time))
            if valid_RMSE < best_valid_error:
                best_epoch = epoch
                nb_of_epocs_without_improvement = 0
                best_valid_error = valid_RMSE
                del best_model
                del best_polyak
                gc.collect()

                best_model = cp.deepcopy(NADE_CF_model)
                best_polyak = cp.deepcopy(shared_polyak)
                print('\n\n Got a good one')
            else:
                nb_of_epocs_without_improvement += 1
                if optimizer == 'Adadelta':
                    pass
                elif nb_of_epocs_without_improvement == look_ahead and lr > 1e-5:
                    nb_of_epocs_without_improvement = 0
                    lr /= 4
                    print("learning rate is now %s" % lr)
            lr_tracer.append(lr)

        print('\n### Training, n_layers=%d' % (len(hidden_size)),
              get_done_text(start_training_time))

        best_y = best_model.apply(input_ratings_cum)
        best_y_cum = T.extra_ops.cumsum(best_y, axis=2)
        best_predicted_ratings = NDimensionalSoftmax().apply(best_y_cum,
                                                             extra_ndim=1)
        self.f_monitor_best = theano.function(inputs=[input_ratings],
                                              outputs=[best_predicted_ratings])
        self.best_valid_error = best_valid_error
        self.best_epoch = best_epoch
        self.best_model = best_model
        self.best_polyak = best_polyak
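A rating prediction in the loops above is the expectation of the per-category probabilities over the rating scores (see rate_score and pred_r). A toy NumPy version with hypothetical probabilities:

import numpy

probs = numpy.array([0.05, 0.10, 0.20, 0.40, 0.25])  # p(rating = 1..5) for one item
rate_score = numpy.arange(1, 6, dtype=numpy.float32)
predicted_rating = (probs * rate_score).sum()         # 3.7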
Exemplo n.º 41
0
    MaxPooling((2, 2), name='MaxPol1'),
    Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx3'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol2'),
    Convolutional(filter_size=(1, 1), num_filters=2, name='Convx4'),
    Rectifier(),
])
conv_sequence1 = ConvolutionalSequence(conv_layers1,
                                       num_channels=512,
                                       image_size=(10, 10),
                                       weights_init=Orthogonal(),
                                       use_bias=False,
                                       name='ConvSeq3')
conv_sequence1.initialize()
out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out5))
predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1)
cost1 = CategoricalCrossEntropy(name='Cross1').apply(
    y.flatten(), predict1).copy(name='cost1')

#SECOND SOFTMAX
conv_layers2 = list([
    MaxPooling((2, 2), name='MaxPol2'),
    Convolutional(filter_size=(1, 1), num_filters=128, name='Convx21'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol11'),
    Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx31'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol21'),
    Convolutional(filter_size=(1, 1), num_filters=2, name='Convx41'),
    Rectifier(),
])
        H1 = dims[i]
        H2 = dims[i + 1]
        width = 2 * np.sqrt(6) / np.sqrt(H1 + H2)
        #         std = np.sqrt(2. / dim)
        linear_layers[i].weights_init = Uniform(width=width)

#     NADE_CF_model.children[0].weights_init = Constant(1)
#     NADE_CF_model.children[0].biases_init = Constant(1.5)
#     NADE_CF_model.children[1].weights_init = Constant(2)
#     NADE_CF_model.children[1].biases_init = Constant(2.5)
    NADE_CF_model.initialize()
    NADE_CF_model.children[-1].parameters[-1].set_value(
        init_b.astype(theano.config.floatX))
    y = NADE_CF_model.apply(input_ratings_cum)
    y_cum = T.extra_ops.cumsum(y, axis=2)
    predicted_ratings = NDimensionalSoftmax().apply(y_cum, extra_ndim=1)
    d = input_masks.sum(axis=1)
    D = (input_masks + output_masks).sum(axis=1)
    #     ratings = T.tensor3(name="ratings", dtype=theano.config.floatX)
    cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1, prob_item_ratings = rating_cost(
        y,
        output_ratings,
        input_masks,
        output_masks,
        D,
        d,
        alpha=alpha,
        std=std)
    cost.name = 'cost'

    cg = ComputationGraph(cost)
Exemplo n.º 43
0
# ******************* Model *******************
recognizer = SimpleSpeechRecognizer(transition=transition,
                dims_transition=conf.dims_transition,
                num_features=num_features, num_classes=num_classes)

#recognizer = SpeechRecognizer(
#    num_features=num_features, dims_bottom=[],
#    dims_bidir=conf.dims_transition, dims_top=[num_classes],
#    bidir_trans=GatedRecurrent, bottom_activation=None)


# ******************* output *******************
y_hat = recognizer.apply(x,x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim = y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# Two cost functions are used, one for training and one for monitoring: the training
# cost is more stable for computing gradients and appears to be more memory efficient,
# but it does not compute the true cost, which the monitoring cost does.
if conf.task=='CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"
    
    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task=='framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name='cost'
    cost_monitor = cost_train
else:
Exemplo n.º 44
0
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1], self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon  # optionally scale (e.g. * 0.6) to reduce variance
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size, skipping
            # the state update after the last step removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if the stacked result is longer than frame_size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # take the i-th chunk of the ground-truth frame as the next RNN input
            index = i * self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        # the truncation above only matters when frnn_step_size * number_of_steps exceeds frame_size
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
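
FRNN_NLL itself is defined elsewhere in the project. As a rough guide to what cost() feeds it, below is a minimal NumPy sketch of a diagonal Gaussian-mixture negative log-likelihood, assuming y, mu and sig can be reshaped to (batch, frame_size, k) and that coeff holds already-softmaxed mixture weights of the same shape; the real function may organise its arguments differently.

import numpy as np

def gmm_nll(y, mu, sig, coeff):
    """y: (batch, frame_size); mu, sig, coeff: (batch, frame_size, k)."""
    y = y[:, :, None]
    log_norm = -0.5 * np.log(2 * np.pi) - np.log(sig)
    log_comp = log_norm - 0.5 * ((y - mu) / sig) ** 2 + np.log(coeff)
    # log-sum-exp over the k components, then sum over the frame
    m = log_comp.max(axis=-1, keepdims=True)
    log_prob = m[..., 0] + np.log(np.exp(log_comp - m).sum(axis=-1))
    return -log_prob.sum(axis=-1)   # one NLL value per batch element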
Exemplo n.º 45
    def __init__(self, *args, **kwargs):
        self.softmax = NDimensionalSoftmax()
        super(MinRiskInitialContextSequenceGenerator,
              self).__init__(*args, **kwargs)
        self.children.append(self.softmax)
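
This snippet, like several classes above, registers an NDimensionalSoftmax as a child brick. In Blocks it wraps an ordinary 2-D softmax so that it can be applied to tensors with extra leading dimensions, e.g. (time, batch, vocab), by flattening them first. A minimal NumPy sketch of what a call such as softmax.log_probabilities(x, extra_ndim=1) computes (the brick's internals differ; this is only an illustration):

import numpy as np

def log_probabilities(x):
    # collapse all leading dimensions (the role extra_ndim plays in Blocks),
    # apply a log-softmax over the last axis, then restore the shape
    flat = x.reshape(-1, x.shape[-1])
    flat = flat - flat.max(axis=1, keepdims=True)   # numerical stability
    log_z = np.log(np.exp(flat).sum(axis=1, keepdims=True))
    return (flat - log_z).reshape(x.shape)

x = np.random.randn(7, 2, 10)                       # (time, batch, vocab)
logs = log_probabilities(x)
assert np.allclose(np.exp(logs).sum(axis=-1), 1.0)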
Exemplo n.º 46
class ActorCriticReadout(SoftmaxReadout):
    """Actor-critic

    Params
    ------
    bos_token : int
        The token used to pad critic input. Critic needs to do
        at least one extra step compared to the actor in order
        to get the first glimpse of the ground-truth sequence
        before predicting the actual values.

    """
    def __init__(self,
                 reward_brick,
                 compute_targets,
                 compute_policy,
                 solve_bellman,
                 freeze_actor,
                 freeze_critic,
                 critic_uses_actor_states,
                 critic_uses_groundtruth,
                 critic=None,
                 critic_burnin_steps=None,
                 critic_policy_t=None,
                 entropy_reward_coof=None,
                 cross_entropy_reward_coof=None,
                 discount=None,
                 value_penalty=None,
                 value_softmax=False,
                 same_value_for_wrong=False,
                 accumulate_outputs=False,
                 use_value_biases=None,
                 actor_grad_estimate=None,
                 bos_token=None,
                 **kwargs):
        super(ActorCriticReadout, self).__init__(**kwargs)
        self.reward_brick = reward_brick
        self.critic = critic
        self.freeze_actor = freeze_actor
        self.freeze_critic = freeze_critic
        self.critic_uses_actor_states = critic_uses_actor_states
        self.critic_uses_groundtruth = (critic_uses_groundtruth
                                        if critic_uses_groundtruth is not None
                                        else True)
        self.critic_burnin_steps = (critic_burnin_steps
                                    if critic_burnin_steps is not None else 0)
        self.value_summand = Linear(output_dim=1, name='summand')
        self.softmax_t = 1.
        self.critic_policy_t = (critic_policy_t
                                if critic_policy_t is not None else 1.0)
        self.epsilon = 0.
        self.discount = (discount if discount is not None else 1.)
        self.entropy_reward_coof = (entropy_reward_coof
                                    if entropy_reward_coof is not None else 0.)
        self.cross_entropy_reward_coof = (cross_entropy_reward_coof
                                          if cross_entropy_reward_coof
                                          is not None else 0.)
        self.value_penalty = value_penalty
        self.value_softmax = value_softmax
        self.same_value_for_wrong = same_value_for_wrong
        self.compute_targets = compute_targets
        self.compute_policy = compute_policy
        self.solve_bellman = solve_bellman
        self.accumulate_outputs = accumulate_outputs
        self.use_value_biases = (use_value_biases
                                 if use_value_biases is not None else True)
        self.actor_grad_estimate = (actor_grad_estimate
                                    if actor_grad_estimate else 'all_actions')
        self.bos_token = bos_token
        self.softmax = NDimensionalSoftmax()
        self.children += [reward_brick, self.value_summand, self.softmax]
        if self.critic:
            self.children.append(self.critic)
        self.costs.inputs += ['attended', 'attended_mask']

    def _push_allocation_config(self):
        super(ActorCriticReadout, self)._push_allocation_config()
        self.value_summand.input_dim = self.get_dim('attended')

    @application
    def scores(self, **inputs):
        merged = self.merge(**dict_subset(inputs, self.merge_names))
        return self.softmax.log_probabilities(merged * self.softmax_t,
                                              extra_ndim=merged.ndim - 2)
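
The scores() application above returns temperature-scaled log-probabilities: the merged readouts are multiplied by softmax_t before the log-softmax, so a larger softmax_t sharpens the policy while a smaller one flattens it. A tiny self-contained NumPy illustration of that effect (names are illustrative only):

import numpy as np

def log_softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

merged = np.array([[2.0, 1.0, 0.0]])
for softmax_t in (0.5, 1.0, 2.0):
    print(softmax_t, np.exp(log_softmax(merged * softmax_t)))  # larger softmax_t -> more peaked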

    @application
    def costs(self, application_call, prediction, prediction_mask, groundtruth,
              groundtruth_mask, **inputs):
        def _prediction_subtensor(data):
            if data.ndim != 3:
                raise ValueError
            flat_data = data.reshape(
                (data.shape[0] * data.shape[1], data.shape[2]))
            flat_data = flat_data[tensor.arange(flat_data.shape[0]),
                                  prediction.flatten()]
            return flat_data.reshape(
                (prediction.shape[0], prediction.shape[1]))

        attended = disconnected_grad(inputs.pop('attended'))
        attended_mask = disconnected_grad(inputs.pop('attended_mask'))

        # Compute the rewards
        rewards = self.reward_brick.apply(prediction, prediction_mask,
                                          groundtruth, groundtruth_mask)[:, :,
                                                                         0]
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

        # Compute the critic outputs
        if self.critic:
            padding = tensor.repeat(tensor.fill(prediction[0:1],
                                                self.bos_token),
                                    1,
                                    axis=0)
            mask_padding = tensor.repeat(tensor.fill(prediction_mask[0:1], 1.),
                                         1,
                                         axis=0)
            padded_prediction = tensor.concatenate([padding, prediction])
            padded_prediction_mask = tensor.concatenate(
                [mask_padding, prediction_mask])
            if self.critic_uses_groundtruth:
                critic_context = groundtruth
                critic_context_mask = groundtruth_mask
            else:
                critic_context = tensor.zeros_like(groundtruth[0:1])
                critic_context_mask = tensor.zeros_like(groundtruth_mask[0:1])
            critic_kwargs = dict(prediction=padded_prediction,
                                 prediction_mask=padded_prediction_mask,
                                 groundtruth=critic_context,
                                 groundtruth_mask=critic_context_mask,
                                 inputs=critic_context,
                                 inputs_mask=critic_context_mask)

            if self.critic_uses_actor_states:
                extra_inputs = disconnected_grad(inputs['states'])
                # We don't need the very last hidden state of the actor
                # in extra_inputs; we just have to add something so that the
                # shapes match. What exactly we add does not matter.
                critic_kwargs['extra_inputs'] = tensor.concatenate(
                    [extra_inputs,
                     tensor.zeros_like(extra_inputs[0:1])])
            critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
            outputs, = VariableFilter(
                applications=[self.critic.generator.readout.all_outputs],
                roles=[OUTPUT])(critic_cg)
            # The first subtensor is discarded because it was produced for the
            # padding token. In addition, Q-values from the first
            # 'critic_burnin_steps' steps are ignored; see later in the code.
            outputs = outputs[1:]
        else:
            outputs = self.merge(**dict_subset(inputs, self.merge_names))
        prediction_outputs = _prediction_subtensor(outputs)

        # Compute Q adjustments
        adjustments = outputs
        prediction_adjustments = prediction_outputs
        if self.accumulate_outputs:
            prediction_adjustments = prediction_outputs.cumsum(axis=0)
            adjustments = tensor.inc_subtensor(
                adjustments[1:], prediction_adjustments[:-1][:, :, None])

        # Compute shared additive biases for all Q values
        if self.use_value_biases:
            value_biases = (self.value_summand.apply(attended)[:, :, 0] *
                            attended_mask).sum(axis=0)
        else:
            value_biases = tensor.zeros_like(adjustments[0, :, 0])
        values = adjustments + value_biases[None, :, None]
        prediction_values = prediction_adjustments + value_biases[None, :]

        rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
        rolled_prediction_mask = tensor.set_subtensor(
            rolled_prediction_mask[-1], 0)

        # Compute probabilities
        logs = self.scores(use_epsilon=False, **inputs)
        probs = tensor.exp(logs)
        if not self.compute_policy:
            raise NotImplementedError("Not supported any more")
        prediction_logs = _prediction_subtensor(logs)

        # Compute value targets
        value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
        value_targets = tensor.roll(value_targets, -1, axis=0)
        value_targets = (
            self.discount * value_targets * rolled_prediction_mask + rewards)
        value_targets = value_targets.astype(theano.config.floatX)

        total_costs = 0

        # Compute critic cost
        if not self.compute_targets:
            logger.debug("Using given targets")
            value_targets = tensor.matrix('value_targets')
        if self.solve_bellman == 'no':
            logger.debug("Not solving Bellman, just predicting the rewards")
            value_targets = rewards.copy(name='value_targets')
        elif self.solve_bellman == 'without_dp':
            future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
            logger.debug("Solving Bellman, but without DP")
            value_targets = future_rewards
        elif self.solve_bellman is not True:
            raise ValueError()
        critic_costs_per_char = (
            (prediction_values - value_targets)**2) * prediction_mask
        critic_costs = critic_costs_per_char[self.critic_burnin_steps:].sum(
            axis=0)
        if not self.freeze_critic:
            total_costs += critic_costs

        # Compute critic Monte-Carlo cost
        critic_monte_carlo_costs = (
            (((prediction_values - future_rewards)**2) *
             prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Value penalty
        if self.value_penalty:
            logger.debug("Use value penalty")
            value_deviations = (values -
                                values.mean(axis=-1, keepdims=True))**2
            if not self.freeze_critic:
                total_costs += (
                    self.value_penalty *
                    (value_deviations.sum(axis=-1) *
                     prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Compute actor cost
        if self.critic:
            # The actor cost will be minimized, which is why the values
            # must be negated.
            est_name = self.actor_grad_estimate
            if est_name == 'all_actions':
                disadvantages = disconnected_grad(
                    values.max(axis=-1)[:, :, None] - values)
                actor_costs = ((probs * disadvantages).sum(axis=-1) *
                               prediction_mask)
                actor_costs = actor_costs[self.critic_burnin_steps:]
            elif est_name.startswith('1_action'):
                # We do not provide a target for the first step because the
                # critic, by construction, gives no estimate of the value of
                # the initial state. Hopefully the network won't unlearn to
                # produce a BOS first.
                future_reward_estimate = (future_rewards
                                          if est_name.endswith('unbiased') else
                                          prediction_values)
                weights = -disconnected_grad(future_reward_estimate[1:] +
                                             rewards[:-1] -
                                             prediction_values[:-1])
                actor_costs = ((prediction_logs[1:] * weights) *
                               prediction_mask[1:])
                actor_costs = actor_costs[self.critic_burnin_steps + 1:]
            else:
                raise ValueError
            actor_costs = actor_costs.sum(axis=0)

            actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
            actor_entropies = actor_entropies[self.critic_burnin_steps:].sum(
                axis=0)
            critic_policy = disconnected_grad(
                self.softmax.apply(self.critic_policy_t * values,
                                   extra_ndim=1))
            critic_cross_entropies = ((critic_policy * -logs).sum(axis=-1) *
                                      prediction_mask)
            critic_cross_entropies = critic_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            actor_costs_with_penalties = (
                actor_costs - self.entropy_reward_coof * actor_entropies -
                self.cross_entropy_reward_coof * critic_cross_entropies)
            if not self.freeze_actor:
                total_costs += actor_costs_with_penalties
            else:
                total_costs += disconnected_grad(actor_costs_with_penalties)

        # Add auxiliary variables for intermediate steps of the computation
        application_call.add_auxiliary_variable(rewards, name='rewards')
        application_call.add_auxiliary_variable(value_biases,
                                                name='value_biases')
        application_call.add_auxiliary_variable(values.copy(), name='values')
        application_call.add_auxiliary_variable(outputs.copy(), name='outputs')
        application_call.add_auxiliary_variable(prediction_values,
                                                name='prediction_values')
        application_call.add_auxiliary_variable(prediction_outputs,
                                                name='prediction_outputs')
        application_call.add_auxiliary_variable(value_targets.copy(),
                                                name='value_targets')
        application_call.add_auxiliary_variable(probs.copy(), name='probs')
        application_call.add_auxiliary_variable(prediction_logs,
                                                name='prediction_log_probs')

        # Compute some statistics for debugging
        last_character_mask = prediction_mask - rolled_prediction_mask
        last_character_costs = (critic_costs_per_char *
                                last_character_mask).sum(axis=0)
        mean2_output = (((prediction_outputs**2) * prediction_mask).sum() /
                        prediction_mask.sum())**0.5
        max_output = abs(prediction_outputs * prediction_mask).max()
        expected_reward = (probs[0] * values[0]).sum(axis=-1)
        application_call.add_auxiliary_variable(last_character_costs,
                                                name='last_character_costs')
        application_call.add_auxiliary_variable(critic_costs.mean(),
                                                name='mean_critic_cost')
        application_call.add_auxiliary_variable(
            critic_monte_carlo_costs.mean(),
            name='mean_critic_monte_carlo_cost')
        if self.critic:
            application_call.add_auxiliary_variable(actor_costs.mean(),
                                                    name='mean_actor_cost')
            application_call.add_auxiliary_variable(actor_entropies.mean(),
                                                    name='mean_actor_entropy')
        application_call.add_auxiliary_variable(expected_reward.mean(),
                                                name='mean_expected_reward')
        application_call.add_auxiliary_variable(mean2_output,
                                                name='mean2_output')
        application_call.add_auxiliary_variable(max_output, name='max_output')

        return total_costs
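
Much of the critic machinery in costs() reduces to two target computations: future_rewards is the plain Monte-Carlo reward-to-go obtained with a reversed cumulative sum, and value_targets is a one-step bootstrapped target built from the expected next-step value under the current policy, the rolled prediction mask and the immediate reward. A small NumPy sketch with (time, batch) shapes; the numbers are made up for illustration.

import numpy as np

rewards = np.array([[0.0], [1.0], [0.0], [2.0]])       # (time, batch)
rolled_mask = np.array([[1.0], [1.0], [1.0], [0.0]])   # prediction_mask shifted back by one step
next_values = np.array([[0.5], [0.3], [1.8], [0.0]])   # (probs * values).sum(-1) rolled by -1
discount = 0.9

future_rewards = rewards[::-1].cumsum(axis=0)[::-1]    # Monte-Carlo reward-to-go
value_targets = discount * next_values * rolled_mask + rewards  # bootstrapped (TD-style) target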
Exemplo n.º 47
    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
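
The snippet above shows only the constructor. The rest of a softmax emitter (not reproduced here) typically samples a token index from the softmax over the readouts and scores target tokens with categorical cross-entropy; the following NumPy sketch is a rough illustration of those two operations, not Blocks' actual implementation.

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
readouts = rng.standard_normal((3, 10))                 # (batch, vocab)
probs = softmax(readouts)

# emit: sample one token per batch element
samples = np.array([rng.choice(probs.shape[1], p=p) for p in probs])

# cost: categorical cross-entropy against target token ids
targets = np.array([1, 4, 7])
cost = -np.log(probs[np.arange(len(targets)), targets])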