Example #1
def KS_distance(expected, observed, mode='D'):
    """
  A symbolic theano expression for Kolmogorov-Smirnov statistical
  distance.

  Note: this implementation uses `cumsum`.
  Theano implementation of `cumsum` falls back to numpy one,
  thus no accelerated code will be generated.

  :param expected: 1D tensor, expectation or canonical distribution.
  :param observed: 2D tensor, first dimension is spatial,
    the second one represents probabilities of empirical distribution.
  :param mode: possible modes:
    D - two-sided KS distance,
    D- and D+ - one sided KS distance.
  :return: symbolic expression for Kolmogorov-Smirnov distance.
  """
    expected_ecpf = T.cumsum(expected)
    observed_ecpf = T.cumsum(observed, axis=1)

    if mode == 'D':
        difference = abs(observed_ecpf - expected_ecpf[None, :])
    elif mode == 'D+':
        difference = T.maximum(observed_ecpf - expected_ecpf[None, :],
                               np.float32(0.0))
    elif mode == 'D-':
        difference = T.maximum(expected_ecpf[None, :] - observed_ecpf,
                               np.float32(0.0))
    else:
        raise Exception('Unknown mode for KS distance: %s' % mode)

    return T.max(difference, axis=1)
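For reference, here is the same computation in plain NumPy (a minimal sketch, assuming `expected` and every row of `observed` are probability vectors over the same bins):

import numpy as np

expected = np.array([0.25, 0.25, 0.25, 0.25], dtype=np.float32)
observed = np.array([[0.10, 0.20, 0.30, 0.40],
                     [0.25, 0.25, 0.25, 0.25]], dtype=np.float32)

expected_ecpf = np.cumsum(expected)          # expected CDF
observed_ecpf = np.cumsum(observed, axis=1)  # one empirical CDF per row

# two-sided KS distance per row (mode 'D')
ks = np.max(np.abs(observed_ecpf - expected_ecpf[None, :]), axis=1)
print(ks)  # -> [0.2, 0.0]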
Example #2
def findalpha2(D, W):
    W = T.flatten(W)
    D = T.flatten(D)
    # the positive  part
    n1 = T.sum(T.gt(W, 0.))
    ind1 = T.argsort(W)[::-1]
    cum_DW1 = T.cumsum(T.abs_(D * W)[ind1])
    cum_D1 = T.cumsum(D[ind1])
    c1 = cum_DW1 / cum_D1 / 2  # tmp = W[ind] - cum_DW_D
    mask1 = T.lt(
        (W[ind1][0:n1 - 1] - c1[0:n1 - 1]) * (W[ind1][1:n1] - c1[0:n1 - 1]), 0)
    thr1 = c1[mask1.nonzero()][T.argmax(
        c1[mask1.nonzero()] * c1[mask1.nonzero()] * cum_D1[mask1.nonzero()])]
    from theano.ifelse import ifelse
    thres1 = ifelse(T.gt(mask1.nonzero()[0].shape[0], 0), thr1,
                    0.7 * c1[n1 - 1])
    # the negative part
    n2 = T.sum(T.lt(W, 0.))
    ind2 = ind1[::-1]
    cum_DW2 = T.cumsum(T.abs_(D * W)[ind2])
    cum_D2 = T.cumsum(D[ind2])
    c2 = cum_DW2 / cum_D2 / 2  # tmp = W[ind] - cum_DW_D
    mask2 = T.lt(
        (-W[ind2][0:n2 - 1] - c2[0:n2 - 1]) * (-W[ind2][1:n2] - c2[0:n2 - 1]),
        0)
    # the negative branch mirrors the positive one, using c2 and cum_D2
    thr2 = c2[mask2.nonzero()][T.argmax(
        c2[mask2.nonzero()] * c2[mask2.nonzero()] * cum_D2[mask2.nonzero()])]
    thres2 = ifelse(T.gt(mask2.nonzero()[0].shape[0], 0), thr2,
                    0.7 * c2[n2 - 1])
    return thres1, thres2
Example #3
    def config_beta_updates(self, beta_lr=1e-3):

        # determine fraction of up-moving particles at each temperature
        fup = self.nup / (self.nup + self.ndown)
        self.get_fup = theano.function([], fup)

        # cost function is: C = \sum_i (fup_{i+1} - fup_i)^2
        # \frac{dC}{d\lambda_i} = \sum_{j >= i}
        #    \frac{dC}{df_j}
        #    \frac{df_j}{d\beta_j}
        #    \frac{d\beta_j}{d\delta\beta_i}      = -1
        #    \frac{d\delta\beta_i}{d\lambda_i}    = \exp(\lambda_i)

        # vectors of length n_beta-2
        f_i = fup[1:-1]
        f_im1 = fup[:-2]
        f_ip1 = fup[2:]
        
        ## \frac{dC}{df_j} ##
        # vector of length n_beta-2
        dc_df = 2*(f_i - f_im1) - 2*(f_ip1 - f_i)

        ## \frac{df_j}{d\beta_j} : estimate it from empirical data ##
        # vector of length n_beta-1
        df_db = (fup[1:] - fup[:-1]) / (self.betas[1:] - self.betas[:-1] + 1e-3)
        # vector of length n_beta-2
        df_db_avg = (df_db[1:] + df_db[:-1])/2.

        dc_dlambda = T.cumsum(dc_df * df_db_avg * -1 * T.exp(self.lambdas))

        # gradient-based beta update
        new_lambdas = self.lambdas - beta_lr * dc_dlambda

        updates = {self._lambdas: T.set_subtensor(self._lambdas[:self.n_beta-2], new_lambdas)}
        self.grad_update_betas = theano.function([], new_lambdas, updates=updates)
Example #4
def ewma(series, axis=None, span=VOL_WINDOW_SPAN, adjust=True, initial=None):
    """
    Exponentially-weighted moving average
    """
    if axis is None:
        if series.ndim == 1:
            axis=0
        else:
            raise ValueError("Please specify which axis to compute ewma over (usually time axis)") 
    assert span >= 1
    alpha = 2. / (span + 1) 
    series = T.swapaxes(series, axis, 0)
    if adjust:
        assert initial is None
        initial = T.zeros_like(series[0])
    else:
        if initial is None:
            initial = series[0]
        initial /= alpha
 
    def ewma_numerator_step(a_i, prev_ewma):
        return a_i + (1. - alpha) * prev_ewma
 
    ewma_numerators, _ = theano.scan(ewma_numerator_step, series,
                                     outputs_info=initial, strict=True)
 
    if adjust:
        ewma_denominators = T.cumsum((1 - alpha) ** T.arange(ewma_numerators.shape[0]))
        series_ewma = ewma_numerators / ewma_denominators.reshape((-1,)+(1,)*(ewma_numerators.ndim-1))
    else:
        series_ewma = ewma_numerators * alpha 
    series_ewma = T.swapaxes(series_ewma, 0, axis)
    return series_ewma
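A quick NumPy cross-check of the adjusted branch (a sketch for a 1-D series; it reproduces the numerator/denominator recursion above and compares it with the direct weighted-average formula):

import numpy as np

span = 5
alpha = 2. / (span + 1)
x = np.random.rand(20)

numer = np.zeros_like(x)
acc = 0.0
for t, x_t in enumerate(x):
    acc = x_t + (1. - alpha) * acc   # same recursion as ewma_numerator_step
    numer[t] = acc
denom = np.cumsum((1. - alpha) ** np.arange(len(x)))
y = numer / denom

# direct formula: y_t = sum_i (1-alpha)^i x_{t-i} / sum_i (1-alpha)^i
t = 5
w = (1. - alpha) ** np.arange(t + 1)
assert np.isclose(y[t], np.dot(w, x[t::-1]) / w.sum())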
Example #5
def sample_from_distribution(p, srng):
    assert p.ndim == 2
    cs = T.cumsum(p, axis=1)
    rnd = srng.uniform(low=0., high=1., dtype='float32', size=(p.shape[0],))
    sel = T.sum(T.gt(rnd.dimshuffle((0, 'x')), cs), axis=1)
    sel = T.clip(sel, 0, p.shape[1] - 1)
    return T.cast(sel, 'int32')
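The same inverse-CDF trick in NumPy (a sketch, assuming each row of p is already normalized): count how many cumulative probabilities lie below one uniform draw per row.

import numpy as np

rng = np.random.default_rng(0)
p = np.array([[0.1, 0.6, 0.3],
              [0.5, 0.25, 0.25]])

cs = np.cumsum(p, axis=1)              # per-row CDF
u = rng.uniform(size=(p.shape[0], 1))  # one draw per row
sel = (u > cs).sum(axis=1)             # sampled category index
sel = np.clip(sel, 0, p.shape[1] - 1)  # guard against float round-off
print(sel)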
Example #6
 def log_likelihood_sym_1traj_GPOMDP(self, x_var, dist_info_vars):
     means = dist_info_vars["mean"]
     log_stds = dist_info_vars["log_std"]
     zs = (x_var - means) / TT.exp(log_stds)
     return TT.cumsum(- TT.sum(log_stds, axis=-1) - \
            0.5 * TT.sum(TT.square(zs), axis=-1) - \
            0.5 * means.shape[-1] * np.log(2 * np.pi))
Example #7
    def logp(self, x):
        n = self.n
        eta = self.eta

        diag_idxs = self.diag_idxs
        cumsum = tt.cumsum(x**2)
        variance = tt.zeros(n)
        variance = tt.inc_subtensor(variance[0], x[0]**2)
        variance = tt.inc_subtensor(
            variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
        sd_vals = tt.sqrt(variance)

        logp_sd = self.sd_dist.logp(sd_vals).sum()
        corr_diag = x[diag_idxs] / sd_vals

        logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag)
        logp_lkj = tt.sum(logp_lkj)

        # Compute the log det jacobian of the second transformation
        # described in the docstring.
        idx = tt.arange(n)
        det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals)
        det_invjac = det_invjac.sum()

        norm = _lkj_normalizing_constant(eta, n)

        return norm + logp_lkj + logp_sd + det_invjac
Example #8
    def scan_gen(self, rng, x0, *params):
        # sequences, outputs, non_sequences
        idx = 0
        h0s = params[idx:idx + len(self.lstms)]
        idx += len(self.lstms)

        xembed = params[idx]
        idx += 1
        yw = params[idx]
        idx += 1
        yb = params[idx]
        idx += 1

        xe = xembed[x0, :]
        y0 = xe
        h1s = []
        for i, lstm in enumerate(self.lstms):
            p = params[idx:idx + len(lstm.recurrent_params)]
            idx += len(lstm.recurrent_params)
            h1, y1 = lstm.step(xs=[y0], h0=h0s[i], params=p)
            h1s.append(h1)
            y0 = y1
        p1 = softmax_nd(T.dot(y0, yw) + yb)
        cs = T.cumsum(p1, axis=1)
        x1 = T.sum(T.gt(rng.dimshuffle((0, 'x')), cs), axis=1)
        x1 = T.clip(x1, 0, cs.shape[1] - 1)
        x1 = T.cast(x1 + 1, 'int32')
        assert idx == len(params)
        return [x1] + h1s
Example #9
def reportDelayDistFunc(cases,mu1,sig1,mu2,sig2,r,n):
    m1 = tt.cast(mu1,'float64')
    s1 = tt.cast(sig1,'float64')
    m2 = tt.cast(mu2,'float64')
    s2 = tt.cast(sig2,'float64')
    sr = tt.cast(r,'float64')
    n = tt.cast(n,'int64')
    x = tt.arange(1,n+1)
    
    # Prepare the Distributions
    sr = tt.clip(r,1e-12,1-1e-12)
    d1 = tt_lognormal(x,tt.log(m1),s1)
    d2 = tt_lognormal(x,tt.log(m2),s2)
    
    print(d1,d2)

    d1 = tt.alloc(d1,1,d1.shape[0])
    d2 = tt.alloc(d2,1,d2.shape[0])
    # Prepare cases as diagonal of matrix
    cin = tt.cast(cases,'float64')
    c2d = tt.nlinalg.alloc_diag(cin)
    # Create a Vector
    
    cf1 = tt.signal.conv.conv2d(c2d,d1,border_mode='full')
    cf2 = tt.signal.conv.conv2d(c2d,d2,border_mode='full')
    
    cfo = (sr*cf1.T + (tt.ones_like(sr)-sr)*cf2.T).T
    reported = tt.cumsum(cfo,axis=1)
    return reported,cfo#,dists
Example #10
def stick_breaking_log(u):
    """Return log of weights from stick-breaking process."""
    lu = tns.concatenate((tns.log(u), [0.0]))
    cs = tns.concatenate(([0.0], tns.cumsum(tns.log1p(-u))))
    lw = lu + cs

    return lw
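A NumPy sketch of what the two cumulative terms represent (assuming u holds K-1 stick-breaking fractions in (0, 1)): weight k is u_k times the length of the stick remaining after the first k-1 breaks, the last weight takes whatever is left, and the weights sum to one.

import numpy as np

u = np.array([0.3, 0.5, 0.2])                           # K-1 breaking fractions

lu = np.concatenate([np.log(u), [0.0]])                 # log u_k, last stick takes the rest
cs = np.concatenate([[0.0], np.cumsum(np.log1p(-u))])   # log of remaining stick length
lw = lu + cs                                            # log weights

w = np.exp(lw)
print(w, w.sum())                                       # [0.3, 0.35, 0.07, 0.28] 1.0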
Example #11
    def __init__(self, n_classes, n_features):
        # learning is done one row (sample) at a time
        X = T.dvector('x')
        # the output class is an integer
        Y = T.iscalar('y')

        W = theano.shared(np.zeros((n_classes, n_features)))

        self.params = [W]

        z = T.dot(X, W.T)  #(n_samples, n_classes)
        scores = T.cumsum(z)  #(n_samples, n_classes)
        output = T.argmax(scores)  #n_samples integers

        self.pred = theano.function([X], output)

        #Loss function
        L = T.sum(scores[output] - scores[Y])
        #error count
        err = T.sum(T.neq(output, Y))

        #compute gradient
        gW = T.grad(L, W)

        #update
        updates = [(W, W - gW)]
        self.train = theano.function([X, Y], [L, err], updates=updates)

        self.err = theano.function([X, Y], err)
Example #12
    def get_output_for(self, policy, greedy=False, **kwargs):
        if greedy:
            # greedy branch
            chosen_action_ids = T.argmax(policy,
                                         axis=-1).astype(self.output_dtype)

        else:

            if self.assume_normalized:
                probas = policy
            else:
                probas = policy / T.sum(policy, axis=-1, keepdims=True)

            # p1, p1+p2, p1+p2+p3, ... 1
            cum_probas = T.cumsum(probas, axis=-1)

            rnd_shape = T.stack([*policy.shape[:-1], 1])

            batch_randomness = self.rng.uniform(low=0.,
                                                high=1.,
                                                size=rnd_shape)
            batch_randomness = T.repeat(batch_randomness,
                                        policy.shape[-1] - 1,
                                        axis=-1)

            chosen_action_ids = T.sum(
                (batch_randomness > cum_probas[:, :, :-1]),
                axis=-1,
                dtype=self.output_dtype)

        return chosen_action_ids
Example #13
    def logp(self, x):
        n = self.n
        eta = self.eta

        diag_idxs = self.diag_idxs
        cumsum = tt.cumsum(x ** 2)
        variance = tt.zeros(n)
        variance = tt.inc_subtensor(variance[0], x[0] ** 2)
        variance = tt.inc_subtensor(
            variance[1:],
            cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
        sd_vals = tt.sqrt(variance)

        logp_sd = self.sd_dist.logp(sd_vals).sum()
        corr_diag = x[diag_idxs] / sd_vals

        logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag)
        logp_lkj = tt.sum(logp_lkj)

        # Compute the log det jacobian of the second transformation
        # described in the docstring.
        idx = tt.arange(n)
        det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals)
        det_invjac = det_invjac.sum()

        norm = _lkj_normalizing_constant(eta, n)

        return norm + logp_lkj + logp_sd + det_invjac
Example #14
 def compute_output(self, network, in_vw):
     axis = network.find_hyperparameter(["axis"])
     network.create_vw(
         "default",
         variable=T.cumsum(in_vw.variable, axis=axis),
         shape=in_vw.shape,
         tags={"output"}
     )
Example #15
 def mask_for_prediction(self, prediction):
     prediction_mask = tensor.lt(
         tensor.cumsum(tensor.eq(prediction, self.eos_label)
                       .astype(theano.config.floatX), axis=0),
         1).astype(theano.config.floatX)
     prediction_mask = tensor.roll(prediction_mask, 1, 0)
     prediction_mask = tensor.set_subtensor(
         prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
     return prediction_mask
Example #16
def get_mask_by_eos(is_eos):
    """takes indicator of "it ends now", returns mask. Ignores everything after first end.
    :param is_eos: indicator that is 0 for all
    :type is_eos: theano.matrix
    """
    assert is_eos.ndim==2
    is_right_after_eos = T.concatenate([T.zeros_like(is_eos[:,:1]),is_eos[:,:-1]],-1)
    is_after_eos = T.eq(T.cumsum(is_right_after_eos,axis=-1),0).astype('uint8')
    return is_after_eos
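A tiny NumPy illustration of the same masking logic (a sketch; 1 marks the step where the sequence ends): shifting the indicator one step to the right and requiring its cumulative sum to still be zero keeps everything up to and including the first end-of-sequence token.

import numpy as np

is_eos = np.array([[0, 0, 1, 0, 1],
                   [0, 1, 0, 0, 0]], dtype=np.uint8)

is_right_after_eos = np.concatenate(
    [np.zeros_like(is_eos[:, :1]), is_eos[:, :-1]], axis=-1)
mask = (np.cumsum(is_right_after_eos, axis=-1) == 0).astype(np.uint8)
print(mask)
# [[1 1 1 0 0]
#  [1 1 0 0 0]]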
Example #17
def cumulative_sum(tensor, axis=-1):
    """
    Keras' backend does not have tf.cumsum().  We're adding it here.
    """
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        return tf.cumsum(tensor, axis=axis)
    else:
        import theano.tensor as T
        return T.cumsum(tensor, axis=axis)
Example #18
    def mixed_generate(self, return_initial_states=True, **kwargs):
        critic = self.generator.readout.critic
        groundtruth = kwargs.pop('groundtruth')
        groundtruth_mask = kwargs.pop('groundtruth_mask')
        step = kwargs.pop('step')

        sampling_inputs = dict_subset(
            kwargs, self.generator.readout.sample.inputs)
        actor_scores = self.generator.readout.scores(**sampling_inputs)

        critic_inputs = {
            name: kwargs['critic_' + name]
            for name in critic.generator.readout.merge_names}
        critic_outputs = critic.generator.readout.outputs(
            groundtruth, groundtruth_mask, **critic_inputs)

        epsilon = numpy.array(self.generator.readout.epsilon,
                              dtype=theano.config.floatX)
        actor_probs = tensor.exp(actor_scores)
        # This is a poor man's 1-hot argmax
        critic_probs = self.softmax.apply(critic_outputs * 1000)
        probs = (actor_probs * (tensor.constant(1) - epsilon)
                 + critic_probs * epsilon)

        x = self.theano_rng.uniform(size=(probs.shape[0],))
        samples = (tensor.gt(x[:, None], tensor.cumsum(probs, axis=1))
                   .astype(theano.config.floatX)
                   .sum(axis=1)
                   .astype('int64'))
        samples = tensor.minimum(samples, probs.shape[1] - 1)

        actor_feedback = self.generator.feedback.apply(samples, as_dict=True)
        actor_states_contexts = dict_subset(
            kwargs,
            self.generator.recurrent.apply.states
            + self.generator.recurrent.apply.contexts)
        actor_states_outputs = self.generator.recurrent.apply(
            as_dict=True, iterate=False,
            **dict_union(actor_feedback, actor_states_contexts))

        critic_feedback = critic.generator.feedback.apply(samples, as_dict=True)
        critic_states_contexts = {
            name: kwargs['critic_' + name]
            for name in
            critic.generator.recurrent.apply.states
            + critic.generator.recurrent.apply.contexts}
        critic_apply_kwargs = dict(
            as_dict=True, iterate=False,
            **dict_union(critic_feedback, critic_states_contexts))
        if self.generator.readout.critic_uses_actor_states:
            critic_apply_kwargs['extra_inputs'] = actor_states_outputs['states']
        critic_states_outputs = critic.generator.recurrent.apply(**critic_apply_kwargs)
        return ([samples, step + 1]
                + actor_states_outputs.values()
                + critic_states_outputs.values())
Example #19
def get_mask_by_eos(is_eos):
    """takes indicator of "it ends now", returns mask. Ignores everything after first end.
    :param is_eos: indicator that is 0 for all
    :type is_eos: theano.matrix
    """
    assert is_eos.ndim == 2
    is_right_after_eos = T.concatenate(
        [T.zeros_like(is_eos[:, :1]), is_eos[:, :-1]], -1)
    is_after_eos = T.eq(T.cumsum(is_right_after_eos, axis=-1),
                        0).astype('uint8')
    return is_after_eos
Example #20
    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
        else:
            p_vals = self._get_p(T.min(self.layers_connectivity_updates[layerIdx-1]))

        # #Implementations of np.choose in theano GPU
        # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
        # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
        return T.sum(T.cumsum(self._mrng.multinomial(pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)), dtype=theano.config.floatX), axis=1), axis=1)
Example #21
    def _calc_rewards(self, symbolic_batch):
        assert symbolic_batch.ndim == 2
        rewards = T.eq(self.target_idxs_shared[None, None, :], symbolic_batch[:, :, None]).any(-1)
        rewards = T.cast(rewards, 'int32')
        assert rewards.ndim == 2

        # Find EOS_ix in batch
        done_mask = T.eq(symbolic_batch, self.vocab.EOS_ix)
        # Set done==True for all words after EOS_ix
        done_mask = T.concatenate([T.zeros_like(done_mask[:, :1]), done_mask[:, :-1]], axis=1)

        is_alive = T.eq(T.cumsum(done_mask, axis=1), 0).astype('uint8')
        return -rewards, is_alive
Example #22
    def pool(self, inputs):
        '''Convert the inputs into a fractionally max-pooled output tensor.
        Implementation adapted from ebenolson:
        https://github.com/Lasagne/Lasagne/pull/171
        '''
        _, _, n_in0, n_in1 = self.input_shape

        n_out0 = fractional_conv_output_length(n_in0, self.pool_size[0])
        n_out1 = fractional_conv_output_length(n_in1, self.pool_size[1])

        # Variable stride across the input creates fractional reduction.
        a = theano.shared(
            np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0)))
        b = theano.shared(
            np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1)))

        # Randomize the input strides.
        a = theano_shuffled(a)
        b = theano_shuffled(b)

        # Convert to input positions, starting at 0.
        a = T.concatenate(([0], a[:-1]))
        b = T.concatenate(([0], b[:-1]))
        a = T.cumsum(a)
        b = T.cumsum(b)

        # Positions of the other corners.
        c = T.clip(a + 1, 0, n_in0 - 1)
        d = T.clip(b + 1, 0, n_in1 - 1)

        # Index the four positions in the pooling window and stack them.
        temp = T.stack(inputs[:, :, a, :][:, :, :,
                                          b], inputs[:, :, c, :][:, :, :, b],
                       inputs[:, :, a, :][:, :, :,
                                          d], inputs[:, :, c, :][:, :, :, d])

        out = T.max(temp, axis=0)

        return out
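The variable-stride construction is easier to see with concrete sizes in NumPy (a sketch): a shuffled mix of strides 2 and 1, turned into positions with cumsum, maps n_in input rows onto n_out pooling windows, which is what gives the fractional reduction.

import numpy as np

n_in, n_out = 7, 5                                           # fractional reduction 7 -> 5
strides = np.array([2] * (n_in - n_out) + [1] * (2 * n_out - n_in))
np.random.shuffle(strides)                                   # randomize where the 2s fall

a = np.cumsum(np.concatenate([[0], strides[:-1]]))           # top-left corner of each window
c = np.clip(a + 1, 0, n_in - 1)                              # bottom-right corner
print(a, c)                                                  # n_out positions each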
Example #23
def sample_categorical(rng, p, axis=-1, values=None):
    """
    p is an n-d array, where the final dimension is a discrete distribution (does not need to be normalized).
    Sample from that distribution.
    This will return an array of shape p.shape[:-1] with values in range [0, p.shape[-1])

    :param rng: A theano shared_randomstreams.RandomStream object
    :param p: An ndarray of arbitrary shape, where the values along (axis) are interpreted as an unnormalized
        discrete probability distribution (so if p.shape[2]==5, it means that the variable can take on 5 possible
        values).
    :param axis: The axis which we consider to be the distribution (only -1, the last axis, is supported now).
    :param values: The values of the variable.  len(values) must equal p.shape[axis].  If not included, the
        values will be considered to be integers in range(0, p.shape[axis])
    """
    # TODO: assert no negative values in p / assert p normalized along axis instead of dividing
    # TODO: assert len(values) == p.shape[axis]
    assert axis == -1, 'Currently you can only sample along the last axis.'
    p = p / tt.sum(p, axis=axis, keepdims=True)
    # TODO: Check that different RNGs are doing the same thing!

    if isinstance(rng, TensorVariable):
        # Externally generated random numbers - we receive the maximum number of uniform random numbers
        # we could need, and then generate samples from those.
        old_p_shape = p.shape
        random_numbers = rng[:p.size].reshape((p.size, 1))
        cumulative_prob_mass = tt.cumsum(p.reshape((-1, p.shape[-1])), axis=1)
        samples = random_numbers < cumulative_prob_mass
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, MRG_RandomStreams):
        # MRG_RandomStreams is faster but only works for 2-d pvals, so we have to reshape and
        # then unreshape.
        old_p_shape = p.shape
        samples = rng.multinomial(n=1, pvals=p.reshape((-1, p.shape[-1])))
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, CURAND_RandomStreams):
        # TODO: Make this work if possible - problem now is it needs to know shape in advance
        raise NotImplementedError("Curand doesn't work yet.")
        cumulative_prob_mass = np.cumsum(p, axis=axis)
        samples = rng.uniform(
            size=tt.set_subtensor(p.shape[axis], 1)) > cumulative_prob_mass
    else:
        samples = tt.switch(
            tt.eq(p.size, 0), tt.zeros(p.shape),
            rng.multinomial(n=1, pvals=tt.switch(tt.eq(p.size, 0), 1, p)))

    indices = tt.argmax(
        samples, axis=-1
    )  # Argmax is just a way to find the location of the only element that is 1.
    if values is not None:
        return values[indices]
    return indices
Example #24
 def mask_for_prediction(self, prediction, groundtruth_mask=None,
                         extra_generation_steps=None):
     prediction_mask = tensor.lt(
         tensor.cumsum(tensor.eq(prediction, self.eos_label)
                       .astype(theano.config.floatX), axis=0),
         1).astype(theano.config.floatX)
     prediction_mask = tensor.roll(prediction_mask, 1, 0)
     prediction_mask = tensor.set_subtensor(
         prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
     if groundtruth_mask:
         max_lengths = groundtruth_mask.sum(axis=0) + extra_generation_steps
         prediction_mask *= tensor.lt(
             tensor.arange(prediction.shape[0])[:, None], max_lengths[None, :])
     return prediction_mask
Example #25
    def log_z_given_v(self, v):
        Wx_plusb = T.dot(v, self.W.T) + self.b

        energies = T.nnet.softplus(Wx_plusb)  # Sum over h'

        if self.penalty == "softplus_bi":
            energies -= self.beta*T.nnet.softplus(self.b)  # Add penalty term
        elif self.penalty == "softplus0":
            energies -= self.beta*T.nnet.softplus(0)  # Add penalty term
        else:
            raise NameError("Invalid penalty term")

        energies = T.cumsum(energies, axis=1)   # Cumsum over z
        return energies
Example #26
    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            lc = self.layers_connectivity[layerIdx]
            p_vals = self._get_p(T.min(lc))
        else:
            lc = self.layers_connectivity_updates[layerIdx-1]
            p_vals = self._get_p(T.min(lc))

        return T.sum(
            T.cumsum(self._mrng.multinomial(
            pvals=T.tile(p_vals[::-1][None, :],(layer_size, 1)), 
            dtype=floatX), axis=1), axis=1
        )
Example #27
def findalpha(D, W):
    W = T.abs_(T.flatten(W))
    D = T.flatten(D)
    #     sorted_W = T.sort(W)[::-1]
    ind = T.argsort(W)[::-1]
    cum_DW = T.cumsum(T.abs_(D * W)[ind])
    cum_D = T.cumsum(D[ind])
    cum_DW_D = cum_DW / cum_D / 2  # tmp = W[ind] - cum_DW_D
    tmp = W[ind][:-1] - cum_DW_D[:-1]
    tmp1 = W[ind][1:] - cum_DW_D[:-1]
    tmp3 = tmp * tmp1

    mask = T.lt(tmp3, 0)

    tmp4 = mask.nonzero()[0].shape[0]
    tmp5 = cum_DW_D[mask.nonzero()] * cum_DW_D[mask.nonzero()] * cum_D[
        mask.nonzero()]
    bb = cum_DW_D[mask.nonzero()][T.argmax(tmp5)]

    from theano.ifelse import ifelse
    thres = ifelse(T.gt(tmp4, 0), bb, 0.7 * cum_DW_D[-1])

    return thres
Example #28
    def get_output_for(self, input, **kwargs):
        # _, _, n_in0, n_in1 = self.input_shape
        # _, _, n_out0, n_out1 = self.get_output_shape()

        # Variable stride across the input creates fractional reduction
        # a = theano.shared(
        #     np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0), dtype=np.int8),
        #     borrow=True)
        # b = theano.shared(
        #     np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1), dtype=np.int8),
        #     borrow=True)
        self.a_shared.set_value(self.a_init, borrow=True)
        self.b_shared.set_value(self.b_init, borrow=True)

        a, b = self.a_shared, self.b_shared
        # Randomize the input strides
        a = self._theano_shuffled(a)
        b = self._theano_shuffled(b)

        # Convert to input positions, starting at 0
        a = T.concatenate(([0], a[:-1]))
        b = T.concatenate(([0], b[:-1]))
        a = T.cumsum(a)
        b = T.cumsum(b)

        # Positions of the other corners
        c = T.clip(a + 1, 0, self.input_shape[2] - 1)
        d = T.clip(b + 1, 0, self.input_shape[3] - 1)

        # Index the four positions in the pooling window and stack them
        #shit won't fit in GPU memory
        temp = T.stack(input[:, :, a, :][:, :, :, b], input[:, :,
                                                            c, :][:, :, :, b],
                       input[:, :, a, :][:, :, :, d], input[:, :,
                                                            c, :][:, :, :, d])

        return self.pool_function(temp, axis=0)
Example #29
    def log_z_given_v(self, v):
        Wx_plusb = T.dot(v, self.W.T) + self.b

        energies = T.nnet.softplus(Wx_plusb)  # Sum over h'

        if self.penalty == "softplus_bi":
            energies -= self.beta * T.nnet.softplus(
                self.b)  # Add penalty term
        elif self.penalty == "softplus0":
            energies -= self.beta * T.nnet.softplus(0)  # Add penalty term
        else:
            raise NameError("Invalid penalty term")

        energies = T.cumsum(energies, axis=1)  # Cumsum over z
        return energies
Example #30
    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
        else:
            p_vals = self._get_p(
                T.min(self.layers_connectivity_updates[layerIdx - 1]))

        # #Implementations of np.choose in theano GPU
        # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
        # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
        return T.sum(T.cumsum(self._mrng.multinomial(
            pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)),
            dtype=theano.config.floatX),
                              axis=1),
                     axis=1)
Example #31
    def get_nll(self, input):
        input_times_W = input.T[:, :, None] * self.W[:, None, :]

        #acc_input_times_W = T.concatenate([T.zeros_like(input_times_W[[0]]), T.cumsum(input_times_W, axis=0)[:-1]], axis=0)
        # Hack for no GPUSplit
        acc_input_times_W = T.cumsum(input_times_W, axis=0)
        #acc_input_times_W = T.roll(acc_input_times_W, 1, axis=1???)  # USES Join internally too
        acc_input_times_W = T.set_subtensor(acc_input_times_W[1:], acc_input_times_W[:-1])
        acc_input_times_W = T.set_subtensor(acc_input_times_W[0, :], 0.0)

        acc_input_times_W += self.b[None, None, :]
        h = self.hidden_activation(acc_input_times_W)

        pre_output = T.sum(h * self.W_prime[:, None, :], axis=2) + self.b_prime[:, None]
        output = T.nnet.sigmoid(pre_output)
        nll = T.sum(T.nnet.softplus(-input.T * pre_output + (1 - input.T) * pre_output), axis=0)
        return nll, output
Example #32
def sample_categorical(rng, p, axis = -1, values = None):
    """
    p is an n-d array, where the final dimension is a discrete distribution (does not need to be normalized).
    Sample from that distribution.
    This will return an array of shape p.shape[:-1] with values in range [0, p.shape[-1])

    :param rng: A theano shared_randomstreams.RandomStream object
    :param p: An ndarray of arbitrary shape, where the values along (axis) are interpreted as an unnormalized
        discrete probability distribution (so if p.shape[2]==5, it means that the variable can take on 5 possible
        values).
    :param axis: The axis which we consider to be the distribution (only -1, the last axis, is supported now).
    :param values: The values of the variable.  len(values) must equal p.shape[axis].  If not included, the
        values will be considered to be integers in range(0, p.shape[axis])
    """
    # TODO: assert no negative values in p / assert p normalized along axis instead of dividing
    # TODO: assert len(values) == p.shape[axis]
    assert axis == -1, 'Currently you can only sample along the last axis.'
    p = p/tt.sum(p, axis = axis, keepdims=True)
    # TODO: Check that different RNGs are doing the same thing!

    if isinstance(rng, TensorVariable):
        # Externally generated random numbers - we receive the maximum number of uniform random numbers
        # we could need, and then generate samples from those.
        old_p_shape = p.shape
        random_numbers = rng[:p.size].reshape((p.size, 1))
        cumulative_prob_mass = tt.cumsum(p.reshape((-1, p.shape[-1])), axis = 1)
        samples = random_numbers < cumulative_prob_mass
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, MRG_RandomStreams):
        # MRG_RandomStreams is faster but only works for 2-d pvals, so we have to reshape and
        # then unreshape.
        old_p_shape = p.shape
        samples = rng.multinomial(n=1, pvals = p.reshape((-1, p.shape[-1])))
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, CURAND_RandomStreams):
        # TODO: Make this work if possible - problem now is it needs to know shape in advance
        raise NotImplementedError("Curand doesn't work yet.")
        cumulative_prob_mass = np.cumsum(p, axis = axis)
        samples = rng.uniform(size = tt.set_subtensor(p.shape[axis], 1)) > cumulative_prob_mass
    else:
        samples = tt.switch(tt.eq(p.size, 0), tt.zeros(p.shape), rng.multinomial(n=1, pvals = tt.switch(tt.eq(p.size, 0), 1, p)))

    indices = tt.argmax(samples, axis = -1)  # Argmax is just a way to find the location of the only element that is 1.
    if values is not None:
        return values[indices]
    return indices
Example #33
    def CTR_AUC(self, y):

        #ll = T.ones((y.shape[0] * 1), 'int8');

        py = self.p_y_given_x[T.arange(y.shape[0]), 1]

        py_si = T.argsort(-py)

        py_s = T.cumsum(y[py_si])

        score = T.sum(T.dot(py_s, 1 - y[py_si]))

        score = score * 1.0 / T.sum(y)

        score = score / (y.shape[0] - T.sum(y))

        return 1 - score
Example #34
    def CTR_AUC(self, y):
        
        #ll = T.ones((y.shape[0] * 1), 'int8')

        py = self.p_y_given_x[T.arange(y.shape[0]), 1]

        py_si = T.argsort(-py)

        py_s = T.cumsum(y[py_si])

        score = T.sum(T.dot(py_s, 1 - y[py_si]))

        score = score * 1.0 / T.sum(y)

        score = score / (y.shape[0] - T.sum(y))

        return 1 - score
Example #35
 def __init__(self, rng, x, topic_num=100):
     
     #input
     L2_input = sparse.csr_matrix("x",dtype=theano.config.floatX)
     #params
     vocab_size = x.shape[1]
     mu, sigma = x.data.mean(), x.data.var()**0.5
     
     rng = numpy.random.RandomState(numpy.random.randint(2**32-1)) if rng is None else rng
     self.L2_w = theano.shared(\
         numpy.asarray(\
             rng.normal(loc=mu,scale=sigma,size=(vocab_size, topic_num)),\
             dtype=theano.config.floatX\
         ),\
         borrow=True\
     )
     self.L2_b = theano.shared(numpy.zeros(topic_num,dtype=theano.config.floatX), borrow=True)
     self.params = [self.L2_w, self.L2_b]
     
      # stick-breaking: sticks -> orthogonal sticks
     L2_stick = sparse.dot(L2_input,self.L2_w)+self.L2_b-\
         0.5*(L2_input.size/vocab_size*tensor.sum(self.L2_w**2,0)+self.L2_b**2)  
     zero_space = tensor.zeros((L2_input.shape[0],1),dtype=theano.config.floatX)
     L2_orth_stick = tensor.join(1, L2_stick, zero_space)\
         - tensor.join(1, zero_space, tensor.cumsum(L2_stick,1))
     Pasterik_orth_stick = tensor.log(1 + tensor.exp(L2_orth_stick))      
     #training model definition
     Likelihood = tensor.mean(Pasterik_orth_stick)
     grads = theano.grad(Likelihood, self.params)#gradient w.r.t params
     eta = tensor.scalar("eta")
     updates = [(param, param+eta*grad) for param, grad in zip(self.params, grads)]
     self._fit = theano.function(\
         inputs=[L2_input, eta],\
         outputs=Likelihood,\
         updates=updates\
     )
     #predict model definition
     self._predict = theano.function(\
         inputs=[L2_input],\
         outputs=tensor.argmax(L2_stick,axis=-1)\
     )
     self._codec = theano.function(\
         inputs=[L2_input],\
         outputs=L2_stick>0\
     )
Example #36
    def DYNAMICS(self, STATE, ACTION):

        OLD_ANGLES = STATE[0:self.n]
        OLD_VELOCITY = STATE[self.n:-2]

        FRICTIONLESS = self.inertia * OLD_VELOCITY + (1 -
                                                      self.inertia) * ACTION
        NEW_VELOCITY = (1 - self.friction) * FRICTIONLESS

        # NEW_ANGLES = OLD_ANGLES + NEW_VELOCITY
        NEW_ANGLES = OLD_ANGLES + OLD_VELOCITY

        ABSOLUTE_ANGLES = tns.cumsum(NEW_ANGLES)

        X = tns.sum(self.lengths * np.cos(ABSOLUTE_ANGLES))
        Y = tns.sum(self.lengths * np.sin(ABSOLUTE_ANGLES))

        return tns.concatenate([NEW_ANGLES, NEW_VELOCITY, [X, Y]])
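The cumsum here turns relative joint angles into absolute link angles; a NumPy forward-kinematics sketch (assuming a planar arm with the given link lengths) shows the end-effector position stored in the last two state entries.

import numpy as np

lengths = np.array([1.0, 0.8, 0.5])    # link lengths of a planar arm
angles = np.array([0.3, -0.2, 0.1])    # joint angles relative to the previous link

absolute = np.cumsum(angles)           # angle of each link w.r.t. the x-axis
x = np.sum(lengths * np.cos(absolute))
y = np.sum(lengths * np.sin(absolute))
print(x, y)                            # end-effector coordinates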
Example #37
    def test_sample_z_given_v(self):
        v = T.matrix('v')
        h = T.matrix('h')
        z = T.iscalar('z')
        E = theano.function([v, h, z], logsumexp(-self.model.E(v, h, z)))

        v1 = np.random.rand(1, self.input_size).astype(config.floatX)
        H = cartesian([(0, 1)] * self.hidden_size, dtype=config.floatX)

        energies = []
        for z in range(1, self.hidden_size + 1):
            h = np.array(H[::2**(self.hidden_size - z)])
            energies.append(E(v1, h, z))

        probs = T.nnet.softmax(T.stack(energies))
        expected_icdf = T.cumsum(probs[:, ::-1], axis=1)[:, ::-1].eval()

        # Test inverse cdf
        v = T.matrix('v')
        icdf_z_given_v = theano.function([v], self.model.icdf_z_given_v(v))
        assert_array_almost_equal(icdf_z_given_v(v1), expected_icdf)

        batch_size = 500000
        self.model.batch_size = batch_size
        sample_zmask_given_v = theano.function(
            [v], self.model.sample_zmask_given_v(v))
        v2 = np.tile(v1, (self.model.batch_size, 1))

        #theano.printing.pydotprint(sample_zmask_given_v)

        z_mask = sample_zmask_given_v(v2)
        # First hidden units should always be considered i.e. z_mask[:, 0] == 1
        assert_equal(np.sum(z_mask[:, 0] == 0, axis=0), 0)

        # Test that sampled masks are as expected i.e. equal expected_icdf
        freq_per_z = np.sum(z_mask, axis=0) / self.model.batch_size
        assert_array_almost_equal(
            freq_per_z,
            expected_icdf[0],
            decimal=3,
            err_msg=
            "Tested using MC sampling, rerun it to be certain that is an error or increase 'batch_size'."
        )
Example #38
    def scan(self, x, rnd, h0, wph, wpx, wpb, whh, whx, whz, whb, z_embeddings,
             *params):
        assert len(params) == len(self.mlp_p.params) + len(self.mlp_h.params)
        params_p = params[:len(self.mlp_p.params)]
        params_h = params[len(self.mlp_p.params):]

        ctx_p = self.activation(T.dot(h0, wph) + T.dot(x, wpx) + wpb)
        pz = self.mlp_p.call_on_params(ctx_p, params_p)
        assert pz.ndim == 1
        cs = T.cumsum(pz, axis=0)
        sel = T.sum(T.gt(rnd, cs))
        sel = T.clip(sel, 0, self.z_k - 1)
        pzs = pz[sel]

        ze = z_embeddings[sel, :]
        ctx_h = self.activation(
            T.dot(h0, whh) + T.dot(x, whx) + T.dot(ze, whz) + whb)
        hd = self.mlp_h.call_on_params(ctx_h, params_h)
        h1 = h0 + hd
        return h1, sel, pzs
Example #39
    def get_output_for(self, policy, greedy=False, **kwargs):
        """
        picks the action with probabilities from policy
        :param policy: probabilities for all actions (e.g. a2c actor policy or standardized Q-values)
        :type policy: tensor of float[batch_id, action_id]
        
        :returns: actions ids of actions picked  
        :rtype: vector of int[batch_id]
        """
        if greedy:
            # greedy branch
            chosen_action_ids = T.argmax(policy,
                                         axis=-1).astype(self.action_dtype)

        else:
            # probabilistic branch
            batch_size, n_actions = policy.shape

            if self.assume_normalized:
                probas = policy
            else:
                probas = policy / T.sum(policy, axis=1, keepdims=True)

            # p1, p1+p2, p1+p2+p3, ... 1
            cum_probas = T.cumsum(probas, axis=1)

            batch_randomness = self.rng.uniform(low=0.,
                                                high=1.,
                                                size=[batch_size, 1])

            # idea: to compute the chosen action we count how many cumulative probabilities are
            # less than the random number [0,1].
            # we deliberately exclude the LAST cumulative probability because it has to be equal to 1
            # by definition (never being less than random[0,1]), but it can be less due to
            # inaccurate float32 computation, causing algorithm to pick action id = (n_actions)+1
            # which results in IndexError
            chosen_action_ids = T.sum((batch_randomness > cum_probas[:, :-1]),
                                      axis=1,
                                      dtype=self.action_dtype)

        return chosen_action_ids
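The counting idea from the comment in NumPy (a sketch): the chosen action is the number of cumulative probabilities strictly below the uniform draw, and dropping the last column keeps the index in range even when the float32 cumulative sum drifts slightly below 1.

import numpy as np

probas = np.array([[0.2, 0.5, 0.3],
                   [0.7, 0.2, 0.1]], dtype=np.float32)
cum_probas = np.cumsum(probas, axis=1)

rng = np.random.default_rng(1)
batch_randomness = rng.uniform(size=(probas.shape[0], 1)).astype(np.float32)

chosen_action_ids = (batch_randomness > cum_probas[:, :-1]).sum(axis=1)
print(chosen_action_ids)               # one action id per row, always < n_actions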
Example #40
    def get_output_for(self,policy,greedy=False,**kwargs):
        """
        picks the action with probabilities from policy
        arguments:
            policy float[batch_id, action_id]: policy values for all actions (e.g. Q-values or action probabilities)
        returns:
            actions int[batch_id]: ids of actions picked  
        """
        
        ## probabilistic branch
        if not greedy:
            batch_size,n_actions = policy.shape

            if self.assume_normalized:
                probas = policy
            else:
                probas = policy / T.sum(policy,axis=1,keepdims=True)


            #p1, p1+p2, p1+p2+p3, ... 1
            cum_probas = T.cumsum(probas,axis=1)


            batch_randomness = self.rng.uniform(low=0.,high=1., size = [probas.shape[0],1])


            #idea: to compute the chosen action we count how many cumulative probabilities are 
            #less than  the random number [0,1].
            #we deliberately exclude the LAST cumulative probability because it has to equal 1
            # by definition (never being less than random[0,1]), but it can be less due to
            #inaccurate float32 computation, causing algorithm to pick action id = (n_actions)+1
            #which results in IndexError
            chosen_action_ids = T.sum((batch_randomness > cum_probas[:,:-1]), axis=1, dtype=self.action_dtype)
            
        
        else: #greedy branch
        
            chosen_action_ids = T.argmax(policy,axis=-1).astype(self.action_dtype)

        return chosen_action_ids
Example #41
def add_exploration(recognizer, data, train_conf):

    prediction = None
    prediction_mask = None
    explore_conf = train_conf.get('exploration', 'imitative')
    if explore_conf in ['greedy', 'mixed']:
        length_expand = 10
        prediction = recognizer.get_generate_graph(
            n_steps=recognizer.labels.shape[0] + length_expand)['outputs']
        prediction_mask = tensor.lt(
            tensor.cumsum(tensor.eq(prediction, data.eos_label), axis=0),
            1).astype(floatX)
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))

        if explore_conf == 'mixed':
            batch_size = recognizer.labels.shape[1]
            targets = tensor.concatenate([
                recognizer.labels,
                tensor.zeros((length_expand, batch_size), dtype='int64')])

            targets_mask = tensor.concatenate([
                recognizer.labels_mask,
                tensor.zeros((length_expand, batch_size), dtype=floatX)])
            rng = MRG_RandomStreams()
            generate = rng.binomial((batch_size,), p=0.5, dtype='int64')
            prediction = (generate[None, :] * prediction +
                          (1 - generate[None, :]) * targets)
            prediction_mask = (tensor.cast(generate[None, :] *
                                           prediction_mask, floatX) +
                               tensor.cast((1 - generate[None, :]) *
                                           targets_mask, floatX))

        prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    elif explore_conf != 'imitative':
        raise ValueError

    return prediction, prediction_mask
Example #42
    def test_sample_z_given_v(self):
        v = T.matrix('v')
        h = T.matrix('h')
        z = T.iscalar('z')
        E = theano.function([v, h, z], logsumexp(-self.model.E(v, h, z)))

        v1 = np.random.rand(1, self.input_size).astype(config.floatX)
        H = cartesian([(0, 1)] * self.hidden_size, dtype=config.floatX)

        energies = []
        for z in range(1, self.hidden_size+1):
            h = np.array(H[::2**(self.hidden_size-z)])
            energies.append(E(v1, h, z))

        probs = T.nnet.softmax(T.stack(energies))
        expected_icdf = T.cumsum(probs[:, ::-1], axis=1)[:, ::-1].eval()

        # Test inverse cdf
        v = T.matrix('v')
        icdf_z_given_v = theano.function([v], self.model.icdf_z_given_v(v))
        assert_array_almost_equal(icdf_z_given_v(v1), expected_icdf)

        batch_size = 500000
        self.model.batch_size = batch_size
        sample_zmask_given_v = theano.function([v], self.model.sample_zmask_given_v(v))
        v2 = np.tile(v1, (self.model.batch_size, 1))

        #theano.printing.pydotprint(sample_zmask_given_v)

        z_mask = sample_zmask_given_v(v2)
        # First hidden units should always be considered i.e. z_mask[:, 0] == 1
        assert_equal(np.sum(z_mask[:, 0] == 0, axis=0), 0)

        # Test that sampled masks are as expected i.e. equal expected_icdf
        freq_per_z = np.sum(z_mask, axis=0) / self.model.batch_size
        assert_array_almost_equal(freq_per_z, expected_icdf[0], decimal=3, err_msg="Tested using MC sampling, rerun it to be certain that is an error or increase 'batch_size'.")
Example #43
 def sample_zmask_given_v(self, v):
     p = self.theano_rng.multinomial(pvals=self.pdf_z_given_v(v), dtype=theano.config.floatX)
     return T.cumsum(p[:, ::-1], axis=1)[:, ::-1]
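In NumPy (a sketch): each row of p is a one-hot draw of z, and the reversed cumulative sum turns it into a mask that keeps exactly the first z hidden units.

import numpy as np

p = np.array([[0, 0, 1, 0],    # z = 3 sampled for this row
              [1, 0, 0, 0]])   # z = 1 sampled for this row

mask = np.cumsum(p[:, ::-1], axis=1)[:, ::-1]
print(mask)
# [[1 1 1 0]
#  [1 0 0 0]]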
Example #44
    def __init__(self, input=None, n_visible=784, n_hidden=500, \
        W=None, hbias=None, vbias=None, numpy_rng = None, theano_rng=None,
        batch_size=1, n_beta=10, beta_lbound=0., n_swaps=10, 
        n_rtime=1, rtime_a=1, rtime_b=100, tau=None):

        """ 
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa), 
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
         part of a larger graph.
        :param n_visible: number of visible units
        :param n_hidden: number of hidden units
        :param W: None for standalone RBMs or symbolic variable pointing to a
         shared weight matrix in case RBM is part of a DBN network; in a DBN,
         the weights are shared between RBMs and layers of a MLP
        :param hbias: None for standalone RBMs or symbolic variable pointing 
         to a shared hidden units bias vector in case RBM is part of a 
         different network
        :param vbias: None for standalone RBMs or a symbolic variable 
         pointing to a shared visible units bias
        :param n_rtime: time constant is inversely proportional to n_rtime x <return time>
        :param tau: optional fixed time constant (overrides n_rtime)
        """
        self.n_visible = n_visible
        self.n_hidden  = n_hidden
  
        # deal with random number generation
        self.numpy_rng = numpy_rng if numpy_rng is not None else numpy.random.RandomState(123)
        self.theano_rng = theano_rng if theano_rng is not None else RandomStreams(self.numpy_rng.randint(2**30))

        if W is None : 
           # W is initialized with `initial_W` which is uniformly sampled
           # from -4*sqrt(6./(n_visible+n_hidden)) and 4*sqrt(6./(n_hidden+n_visible))
           # the output of uniform is converted using asarray to dtype
           # theano.config.floatX so that the code is runnable on GPU
           initial_W = 0.01 * self.numpy_rng.randn(n_visible, n_hidden)
           initial_W = numpy.asarray(initial_W, dtype = theano.config.floatX)
           # theano shared variables for weights and biases
           W = theano.shared(value = initial_W, name = 'W')

        if hbias is None :
           # create shared variable for hidden units bias
           hbias = theano.shared(value = numpy.zeros(n_hidden, 
                               dtype = theano.config.floatX), name='hbias')

        if vbias is None :
           # create shared variable for visible units bias
           vbias = theano.shared(value =numpy.zeros(n_visible, 
                                dtype = theano.config.floatX),name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input 
        if not input:
            self.input = T.matrix('input')

        self.W          = W
        self.hbias      = hbias
        self.vbias      = vbias

        bufsize = 100

        #########################################################################
        # Fields indexed by mixstat:    nvis, E, beta, labels, rtime
        # Fields indexed by temp index: mixstat, fup_target, nup, ndown, swapstat
        #########################################################################

        ### initialize tempering stuff ###
        self.batch_size = batch_size   # size of negative minibatch
        self.n_beta  = theano.shared(n_beta, name='n_beta') # number of temperatures in system
        self.n_chain = theano.shared(batch_size * n_beta, name='n_chain') # number of active chains in nvis array

        self._nvis = theano.shared(self.numpy_rng.randint(0,2,size=(batch_size*bufsize, n_visible)), name='nvis')
        self.nvis = self._nvis[:self.n_chain]

        # vectors containing energy and free-energy of current negative particles (at T=1)
        self._E = theano.shared(numpy.zeros(batch_size*bufsize), name='E')
        self.E = self._E[:self.n_chain]

        ## Betas are parametrized as delta_bi = exp(\lambda_i)
        ## Resulting betas are linearly spaced between 1 and 0

        # shared parameters are the lambda_i
        lambdas = numpy.zeros(bufsize)  # leave room to grow ...        
        lambdas[:n_beta-2] = numpy.log((1.0 - beta_lbound)/(n_beta-1))
        self._lambdas = theano.shared(lambdas, name='lambdas')
        self.lambdas = self._lambdas[:n_beta-2]

        # initialize data structure to map nhid/nvis rows to a given temperature
        mixstat = numpy.zeros((batch_size, bufsize), dtype='int32')
        mixstat[:, :n_beta] = numpy.arange(batch_size*n_beta).reshape(batch_size, n_beta)
        self._mixstat = theano.shared(mixstat, name='mixstat')
        self.mixstat = self._mixstat[:, :self.n_beta]

        # convert lambdas to actual beta values
        _betas1 = 1 - T.cumsum(T.exp(self.lambdas))
        _betas2 = T.join(0, T.shape_padright(1.0), _betas1)
        _betas3 = T.join(0, _betas2, T.shape_padright(beta_lbound))
        self.betas = _betas3
        self.mixed_betas = pt_mix(self.betas, self.mixstat)
        self.mixed_betas_matrix = T.shape_padright(self.mixed_betas)

        self.get_betas = theano.function([], self.betas)

        # labels: 1 means going up in temperature, 0 going down in temperature
        labels = LBL_NONE * numpy.ones(batch_size*bufsize, dtype='int32')
        labels[mixstat[:,0]] = LBL_UP
        self.labels = theano.shared(labels, name='labels') 

        # configure histogram of up moving particles
        _nup = numpy.zeros(bufsize)
        _nup[:n_beta] = numpy.linspace(1,0,n_beta)
        self._nup = theano.shared(_nup, name='nup')
        self.nup = self._nup[:self.n_beta]
        
        # configure histogram of down moving particles
        _ndown = numpy.zeros(bufsize)
        _ndown[:n_beta] = numpy.linspace(0,1,n_beta)
        self._ndown = theano.shared(_ndown, name='ndown')
        self.ndown = self._ndown[:self.n_beta]

        # return time
        rtime = numpy.zeros(batch_size*bufsize, dtype='int32')
        self.rtime = theano.shared(rtime, name='rtime') 
        self.avg_rtime = theano.shared(
                numpy.asarray(rtime_deo(0.4,n_beta), dtype=theano.config.floatX), 
                name='avg_rtime')

        # use return time as the time constant for all moving averages
        if not tau:
            self.tau = rtime_a/(n_rtime*self.avg_rtime + rtime_b)
        else:
            self.tau = T.as_tensor(tau)
        self.get_tau = theano.function([], self.tau)

        # create PT Op
        self.n_swaps = n_swaps
        self._swapstat = theano.shared(numpy.zeros(bufsize), name='swapstat')
        self.swapstat = self._swapstat[:self.n_beta]

        self.pt_swaps = PT_Swaps(n_swaps=self.n_swaps, seed=self.numpy_rng.randint(1 << 32))
Example #45
 def backward(self, y):
     x = tt.zeros(y.shape)
     x = tt.inc_subtensor(x[..., 0], y[..., 0])
     x = tt.inc_subtensor(x[..., 1:], tt.exp(y[..., 1:]))
     return tt.cumsum(x, axis=-1)
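A NumPy sketch of this backward transform (assuming y is an unconstrained vector): keeping the first entry and exponentiating the rest before a cumulative sum always produces a strictly increasing (ordered) vector.

import numpy as np

y = np.array([-0.5, 0.3, -1.2, 0.8])   # unconstrained input

x = np.empty_like(y)
x[0] = y[0]
x[1:] = np.exp(y[1:])                  # strictly positive increments
x = np.cumsum(x)                       # ordered output
print(x)
assert np.all(np.diff(x) > 0)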
Example #46
    def compute_output(self, network, in_vw):
        if network.find_hyperparameter(["deterministic"]):
            import warnings
            warnings.warn("OverlappingRandomFractionalMaxPool2DNode has "
                          "no deterministic implementation")

        pool_size = network.find_hyperparameter(["pool_size"])
        assert len(pool_size) == 2
        for alpha in pool_size:
            assert 1 < alpha < 2
        pool_fn = network.find_hyperparameter(["pool_function"], T.max)

        # NOTE: MRG_RandomStreams doesn't have "permutation"
        srng = T.shared_randomstreams.RandomStreams()

        def theano_shuffled(in_vw):
            n = in_vw.shape[0]

            shuffled = T.permute_row_elements(in_vw.T, srng.permutation(n=n)).T
            return shuffled

        out_shape = list(in_vw.shape)
        for axis, alpha in zip([2, 3], pool_size):
            out_shape[axis] = int(np.ceil(float(out_shape[axis]) / alpha))
        out_shape = tuple(out_shape)

        n_in0, n_in1 = in_vw.shape[2:]
        n_out0, n_out1 = out_shape[2:]

        # Variable stride across the input creates fractional reduction
        a = theano.shared(
            np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0)))
        b = theano.shared(
            np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1)))

        # Randomize the input strides
        a = theano_shuffled(a)
        b = theano_shuffled(b)

        # Convert to input positions, starting at 0
        a = T.concatenate(([0], a[:-1]))
        b = T.concatenate(([0], b[:-1]))
        a = T.cumsum(a)
        b = T.cumsum(b)

        # Positions of the other corners
        c = T.clip(a + 1, 0, n_in0 - 1)
        d = T.clip(b + 1, 0, n_in1 - 1)

        # Index the four positions in the pooling window and stack them
        in_var = in_vw.variable
        temp = T.stack(in_var[:, :, a, :][:, :, :, b],
                       in_var[:, :, c, :][:, :, :, b],
                       in_var[:, :, a, :][:, :, :, d],
                       in_var[:, :, c, :][:, :, :, d])

        out_var = pool_fn(temp, axis=0)

        network.create_vw(
            "default",
            variable=out_var,
            shape=out_shape,
            tags={"output"}
        )
Example #47
  def __init__(self,
               n_out = None,
               n_units = None,
               direction = 1,
               truncation = -1,
               sampling = 1,
               encoder = None,
               unit = 'lstm',
               n_dec = 0,
               attention = "none",
               recurrent_transform = "none",
               recurrent_transform_attribs = "{}",
               attention_template = 128,
               attention_distance = 'l2',
               attention_step = "linear",
               attention_beam = 0,
               attention_norm = "exp",
               attention_momentum = "none",
               attention_sharpening = 1.0,
               attention_nbest = 0,
               attention_store = False,
               attention_smooth = False,
               attention_glimpse = 1,
               attention_filters = 1,
               attention_accumulator = 'sum',
               attention_loss = 0,
               attention_bn = 0,
               attention_lm = 'none',
               attention_ndec = 1,
               attention_memory = 0,
               attention_alnpts = 0,
               attention_epoch  = 1,
               attention_segstep=0.01,
               attention_offset=0.95,
               attention_method="epoch",
               attention_scale=10,
               context=-1,
               base = None,
               aligner = None,
               lm = False,
               force_lm = False,
               droplm = 1.0,
               forward_weights_init=None,
               bias_random_init_forget_shift=0.0,
               copy_weights_from_base=False,
               segment_input=False,
               join_states=False,
               sample_segment=None,
               **kwargs):
    """
    :param n_out: number of cells
    :param n_units: used when initialized via Network.from_hdf_model_topology
    :param direction: process sequence in forward (1) or backward (-1) direction
    :param truncation: gradient truncation
    :param sampling: scan every nth frame only
    :param encoder: list of encoder layers used as initialization for the hidden state
    :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru')
    :param n_dec: absolute number of steps to unfold the network if integer, else relative number of steps from encoder
    :param recurrent_transform: name of recurrent transform
    :param recurrent_transform_attribs: dictionary containing parameters for a recurrent transform
    :param attention_template:
    :param attention_distance:
    :param attention_step:
    :param attention_beam:
    :param attention_norm:
    :param attention_sharpening:
    :param attention_nbest:
    :param attention_store:
    :param attention_align:
    :param attention_glimpse:
    :param attention_lm:
    :param base: list of layers whose outputs are used as the base during attention mechanisms
    :param lm: activate RNNLM
    :param force_lm: expect previous labels to be given during testing
    :param droplm: probability to take the expected output as predecessor instead of the real one when LM=true
    :param bias_random_init_forget_shift: initialize forget gate bias of lstm networks with this value
    """
    source_index = None
    if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')):
      kwargs['sources'] = []
      source_index = kwargs['index']
    unit_given = unit
    from Device import is_using_gpu
    if unit == 'lstm':  # auto selection
      if not is_using_gpu():
        unit = 'lstme'
      elif recurrent_transform == 'none' and (not lm or droplm == 0.0):
        unit = 'lstmp'
      else:
        unit = 'lstmc'
    elif unit in ("lstmc", "lstmp") and not is_using_gpu():
      unit = "lstme"
    if segment_input:
      if is_using_gpu():
        unit = "lstmps"
      else:
        unit = "lstms"
    if n_out is None:
      assert encoder
      n_out = sum([enc.attrs['n_out'] for enc in encoder])
    kwargs.setdefault("n_out", n_out)
    if n_units is not None:
      assert n_units == n_out
    self.attention_weight = T.constant(1.,'float32')
    if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'):
      kwargs['sources'] = []
    elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'):
      kwargs['sources'] = []
    super(RecurrentUnitLayer, self).__init__(**kwargs)
    self.set_attr('from', ",".join([s.name for s in self.sources]) if self.sources else "null")
    self.set_attr('n_out', n_out)
    self.set_attr('unit', unit_given.encode("utf8"))
    self.set_attr('truncation', truncation)
    self.set_attr('sampling', sampling)
    self.set_attr('direction', direction)
    self.set_attr('lm', lm)
    self.set_attr('force_lm', force_lm)
    self.set_attr('droplm', droplm)
    if bias_random_init_forget_shift:
      self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift)
    self.set_attr('attention_beam', attention_beam)
    self.set_attr('recurrent_transform', recurrent_transform.encode("utf8"))
    if isinstance(recurrent_transform_attribs, str):
      recurrent_transform_attribs = json.loads(recurrent_transform_attribs)
    if attention_template is not None:
      self.set_attr('attention_template', attention_template)
    self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs)
    self.set_attr('attention_distance', attention_distance.encode("utf8"))
    self.set_attr('attention_step', attention_step.encode("utf8"))
    self.set_attr('attention_norm', attention_norm.encode("utf8"))
    self.set_attr('attention_sharpening', attention_sharpening)
    self.set_attr('attention_nbest', attention_nbest)
    attention_store = attention_store or attention_smooth or attention_momentum != 'none'
    self.set_attr('attention_store', attention_store)
    self.set_attr('attention_smooth', attention_smooth)
    self.set_attr('attention_momentum', attention_momentum.encode('utf8'))
    self.set_attr('attention_glimpse', attention_glimpse)
    self.set_attr('attention_filters', attention_filters)
    self.set_attr('attention_lm', attention_lm)
    self.set_attr('attention_bn', attention_bn)
    self.set_attr('attention_accumulator', attention_accumulator)
    self.set_attr('attention_ndec', attention_ndec)
    self.set_attr('attention_memory', attention_memory)
    self.set_attr('attention_loss', attention_loss)
    self.set_attr('n_dec', n_dec)
    self.set_attr('segment_input', segment_input)
    self.set_attr('attention_alnpts', attention_alnpts)
    self.set_attr('attention_epoch', attention_epoch)
    self.set_attr('attention_segstep', attention_segstep)
    self.set_attr('attention_offset', attention_offset)
    self.set_attr('attention_method', attention_method)
    self.set_attr('attention_scale', attention_scale)
    if segment_input:
      if not self.eval_flag:
      #if self.eval_flag:
        if isinstance(self.sources[0],RecurrentUnitLayer):
          self.inv_att = self.sources[0].inv_att #NBT
        else:
          if not join_states:
            self.inv_att = self.sources[0].attention #NBT
          else:
            assert hasattr(self.sources[0], "nstates"), "source does not have number of states!"
            ns = self.sources[0].nstates
            self.inv_att = self.sources[0].attention[(ns-1)::ns]
        inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN
        inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2])))
        inv_att = T.max(inv_att,axis=-1)
      else:
        inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1]))
    if encoder and hasattr(encoder[0],'act'):
      self.set_attr('encoder', ",".join([e.name for e in encoder]))
    if base:
      self.set_attr('base', ",".join([b.name for b in base]))
    else:
      base = encoder
    self.base = base
    self.encoder = encoder
    if aligner:
      self.aligner = aligner
    self.set_attr('n_units', n_out)
    unit = eval(unit.upper())(**self.attrs)
    assert isinstance(unit, Unit)
    self.unit = unit
    kwargs.setdefault("n_out", unit.n_out)
    n_out = unit.n_out
    self.set_attr('n_out', unit.n_out)
    if n_dec < 0:
      source_index = self.index
      n_dec *= -1
    if n_dec != 0:
      self.target_index = self.index
      if isinstance(n_dec,float):
        if source_index is None:
          source_index = encoder[0].index if encoder else base[0].index
        lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32')
        idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]),
                            [lengths], [T.max(lengths)+1])
        self.index = idx.dimshuffle(1,0)[:-1]
        n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32')
      else:
        if encoder:
          self.index = encoder[0].index
        self.index = T.ones((n_dec,self.index.shape[1]),'int8')
    else:
      n_dec = self.index.shape[0]
    # initialize recurrent weights
    self.W_re = None
    if unit.n_re > 0:
      self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name))
    # initialize forward weights
    bias_init_value = self.create_bias(unit.n_in).get_value()
    if bias_random_init_forget_shift:
      assert unit.n_units * 4 == unit.n_in  # (input gate, forget gate, output gate, net input)
      bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift
    self.b.set_value(bias_init_value)
    if not forward_weights_init:
      forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re
    else:
      self.set_attr('forward_weights_init', forward_weights_init)
    self.forward_weights_init = forward_weights_init
    self.W_in = []
    sample_mean, gamma = None, None
    if copy_weights_from_base:
      self.params = {}
      #self.W_re = self.add_param(base[0].W_re)
      #self.W_in = [ self.add_param(W) for W in base[0].W_in ]
      #self.b = self.add_param(base[0].b)
      self.W_re = base[0].W_re
      self.W_in = base[0].W_in
      self.b = base[0].b
      if self.attrs.get('batch_norm', False):
        sample_mean = base[0].sample_mean
        gamma = base[0].gamma
      #self.masks = base[0].masks
      #self.mass = base[0].mass
    else:
      for s in self.sources:
        W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name))
        self.W_in.append(self.add_param(W))
    # make input
    z = self.b
    for x_t, m, W in zip(self.sources, self.masks, self.W_in):
      if x_t.attrs['sparse']:
        if x_t.output.ndim == 3: out_dim = x_t.output.shape[2]
        elif x_t.output.ndim == 2: out_dim = 1
        else: assert False, x_t.output.ndim
        if x_t.output.ndim == 3:
          z += W[T.cast(x_t.output[:,:,0], 'int32')]
        elif x_t.output.ndim == 2:
          z += W[T.cast(x_t.output, 'int32')]
        else:
          assert False, x_t.output.ndim
      elif m is None:
        z += T.dot(x_t.output, W)
      else:
        z += self.dot(self.mass * m * x_t.output, W)
    #if self.attrs['batch_norm']:
    #  z = self.batch_norm(z, unit.n_in)
    num_batches = self.index.shape[1]
    self.num_batches = num_batches
    non_sequences = []
    if self.attrs['lm'] or attention_lm != 'none':
      if not 'target' in self.attrs:
        self.attrs['target'] = 'classes'
      if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm):
        if copy_weights_from_base:
          self.W_lm_in = base[0].W_lm_in
          self.b_lm_in = base[0].b_lm_in
        else:
          l = sqrt(6.) / sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out)
          values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX)
          self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_in_"+self.name))
          self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in')
      l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX)
      if copy_weights_from_base:
        self.W_lm_out = base[0].W_lm_out
      else:
        self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_out_"+self.name))
      if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        self.lmmask = 1
        #if recurrent_transform != 'none':
        #  recurrent_transform = recurrent_transform[:-3]
      elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm):
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        srng = RandomStreams(self.rng.randint(1234) + 1)
        self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)
      else:
        self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)

    if recurrent_transform == 'input': # attention is just a sequence dependent bias (lstmp compatible)
      src = []
      src_names = []
      n_in = 0
      for e in base:
        #src_base = [ s for s in e.sources if s.name not in src_names ]
        #src_names += [ s.name for s in e.sources ]
        src_base = [ e ]
        src_names += [e.name]
        src += [s.output for s in src_base]
        n_in += sum([s.attrs['n_out'] for s in src_base])
      self.xc = T.concatenate(src, axis=2)
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + n_in)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX)
      self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name = "W_att_xc"))
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX)
      self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name = "W_att_in"))
      zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc))) # TB1
      self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2],axis=2), axis=0, keepdims=True), self.W_att_in)
      recurrent_transform = 'none'
    elif recurrent_transform == 'attention_align':
      max_skip = base[0].attrs['max_skip']
      values = numpy.zeros((max_skip,), dtype=theano.config.floatX)
      self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b")
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip)
      values = numpy.asarray(self.rng.uniform(
        low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX)
      self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W")
      y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32'))  # NB
      y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0)  # (N+1)B
      y_t = y_t[1:] - y_t[:-1]  # NB
      self.y_t = y_t # T.clip(y_t,numpy.float32(0),numpy.float32(max_skip - 1))

      self.y_t = T.cast(self.base[0].backtrace,'float32')
    elif recurrent_transform == 'attention_segment':
      assert aligner.attention, "Segment-wise attention requires attention points!"

    recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self)
    assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase)
    unit.recurrent_transform = recurrent_transform_inst
    self.recurrent_transform = recurrent_transform_inst
    # scan over sequence
    for s in range(self.attrs['sampling']):
      index = self.index[s::self.attrs['sampling']]

      if context > 0:
        from TheanoUtil import context_batched
        n_batches = z.shape[1]
        time, batch, dim = z.shape[0], z.shape[1], z.shape[2]
        #z = context_batched(z[::direction or 1], window=context)[::direction or 1] # TB(CD)

        from theano.ifelse import ifelse
        def context_window(idx, x_in, i_in):
          x_out = x_in[idx:idx + context]
          x_out = x_out.dimshuffle('x',1,0,2).reshape((1, batch, dim * context))
          i_out = i_in[idx:idx+1].repeat(context, axis=0)
          i_out = ifelse(T.lt(idx,context),T.set_subtensor(i_out[:context - idx],numpy.int8(0)),i_out).reshape((1, batch * context))
          return x_out, i_out

        z = z[::direction or 1]
        i = index[::direction or 1]
        out, _ = theano.map(context_window, sequences = [T.arange(z.shape[0])], non_sequences = [T.concatenate([T.zeros((context - 1,z.shape[1],z.shape[2]),dtype='float32'),z],axis=0), i])
        z = out[0][::direction or 1]
        i = out[1][::direction or 1] # T(BC)
        direction = 1
        z = z.reshape((time * batch, context * dim)) # (TB)(CD)
        z = z.reshape((time * batch, context, dim)).dimshuffle(1,0,2) # C(TB)D
        i = i.reshape((time, context, batch)).dimshuffle(1,0,2).reshape((context, time * batch))
        index = i
        num_batches = time * batch

      sequences = z
      sources = self.sources
      if encoder:
        if recurrent_transform == "attention_segment":
          if hasattr(encoder[0],'act'):
            outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
          else:
           # outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
            outputs_info[0] = self.aligner.output[-1]
        elif hasattr(encoder[0],'act'):
          outputs_info = [ T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act) ]
        else:
          outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))
      else:
        outputs_info = [ T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act) ]

      if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        if self.network.y[self.attrs['target']].ndim == 3:
          sequences += T.dot(self.network.y[self.attrs['target']],self.W_lm_out)
        else:
          y = self.y_in[self.attrs['target']].flatten()
          sequences += self.W_lm_out[y].reshape((index.shape[0],index.shape[1],unit.n_in))

      if sequences == self.b:
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))

      if unit.recurrent_transform:
        outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial()

      index_f = T.cast(index, theano.config.floatX)
      unit.set_parent(self)

      if segment_input:
        outputs = unit.scan_seg(x=sources,
                                z=sequences[s::self.attrs['sampling']],
                                att = inv_att,
                                non_sequences=non_sequences,
                                i=index_f,
                                outputs_info=outputs_info,
                                W_re=self.W_re,
                                W_in=self.W_in,
                                b=self.b,
                                go_backwards=direction == -1,
                                truncate_gradient=self.attrs['truncation'])
      else:
        outputs = unit.scan(x=sources,
                            z=sequences[s::self.attrs['sampling']],
                            non_sequences=non_sequences,
                            i=index_f,
                            outputs_info=outputs_info,
                            W_re=self.W_re,
                            W_in=self.W_in,
                            b=self.b,
                            go_backwards=direction == -1,
                            truncate_gradient=self.attrs['truncation'])

      if not isinstance(outputs, list):
        outputs = [outputs]
      if outputs:
        outputs[0].name = "%s.act[0]" % self.name
        if context > 0:
          for i in range(len(outputs)):
            outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1]//n_batches,n_batches,outputs[i].shape[2]))

      if unit.recurrent_transform:
        unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):]

      if self.attrs['sampling'] > 1:
        if s == 0:
          self.act = [ T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs ]
        self.act = [ T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot,act in zip(self.act, outputs) ]
      else:
        self.act = outputs[:unit.n_act]
        if len(outputs) > unit.n_act:
          self.aux = outputs[unit.n_act:]
    if self.attrs['attention_store']:
      self.attention = [ self.aux[i].dimshuffle(0,2,1) for i,v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_') ] # NBT
      for i in range(len(self.attention)):
        vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1))
        last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1)
        self.attention[i] = T.concatenate([self.attention[i][1:],last],axis=0)[::direction]

    self.cost_val = numpy.float32(0)
    if recurrent_transform == 'attention_align':
      back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')])
      def make_output(base, yout, trace, length):
        length = T.cast(length, 'int32')
        idx = T.cast(trace[:length][::-1],'int32')
        x_out = T.concatenate([base[idx],T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')],axis=0)
        y_out = T.concatenate([yout[idx,T.arange(length)],T.zeros((self.index.shape[0] + 1 - length, ), 'float32')],axis=0)
        return x_out, y_out

      output, _ = theano.map(make_output,
                             sequences = [base[0].output.dimshuffle(1,0,2),
                                          self.y_t.dimshuffle(1,2,0),
                                          back.dimshuffle(1,0),
                                          T.sum(self.index,axis=0,dtype='float32')])
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      self.output = output[0].dimshuffle(1,0,2)[:-1]

      z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b
      z = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))
      idx = (self.index[1:].flatten() > 0).nonzero()
      idy = (self.index[1:][::-1].flatten() > 0).nonzero()
      y_out = T.cast(output[1],'int32').dimshuffle(1, 0)[:-1].flatten()
      nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy])
      self.cost_val = T.sum(nll)
      recog = T.argmax(z[idx], axis=1)
      real = y_out[idy]
      self.errors = lambda: T.sum(T.neq(recog, real))

      return
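      # NOTE: unreachable -- the constructor returns above, so none of the remaining statements in this branch are executed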

      back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      idx = (self.index[:-1].flatten() > 0).nonzero()
      idx = T.cast(back[::-1].flatten()[idx],'int32')
      x_out = base[0].output
      #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2)
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([x_out, base[0].output[1:]],axis=0)
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return


      skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape)
      shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      skips = T.concatenate([T.zeros_like(self.y_t[:1]),self.y_t[:-1]],axis=0)
      idx = shift + T.cumsum(skips, axis=0)
      idx = T.cast(idx[:-1].flatten(),'int32')
      #idx = (idx.flatten() > 0).nonzero()
      #idx = base[0].attention.flatten()
      x_out = base[0].output[::-1]
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1]
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return

    if recurrent_transform == 'batch_norm':
      self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0],axis=[0,1]),self.W_re)
      self.params['sample_mean_batch_norm'].custom_update_normalized = True

    self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma)
    self.params.update(unit.params)
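
The float-valued n_dec branch above derives the decoder index mask from a fraction of each encoder length via theano.map. Below is a minimal, self-contained sketch of that construction; the 0.5 fraction, the input shapes, and the variable names are assumptions made purely for illustration.

import numpy
import theano
import theano.tensor as T

source_index = T.bmatrix('source_index')  # encoder index mask, time x batch, int8
n_dec_frac = numpy.float32(0.5)           # assumed fraction, for illustration only
lengths = T.cast(T.ceil(T.sum(T.cast(source_index, 'float32'), axis=0) * n_dec_frac), 'int32')
idx, _ = theano.map(lambda l_i, l_m: T.concatenate([T.ones((l_i,), 'int8'), T.zeros((l_m - l_i,), 'int8')]),
                    [lengths], [T.max(lengths) + 1])
dec_index = idx.dimshuffle(1, 0)[:-1]     # decoder index mask, time x batch

f = theano.function([source_index], dec_index)
print(f(numpy.ones((6, 2), dtype='int8')))  # three decoder frames of ones for each of the two sequences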
Example #48
0
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    :param expert_predictions: (experts, validation_samples, 600)
    :param mask_matrix: (experts, validation_samples)
    :param targets: (validation_samples, 600)
    :param average_distribution: (600,)
    :param eps: small constant for numerical stability (avoids log(0) and division by zero)
    :return: the blended cumulative distribution if do_optimization is False,
        otherwise a tuple (expert weights, validation loss, optimal parameters)
    """
    if expert_weights is not None:
        mask_matrix = mask_matrix[expert_weights>cutoff,:]  # remove
        expert_predictions = expert_predictions[expert_weights>cutoff,:,:]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE-1)/2, (WINDOW_SIZE-1)/2, num=WINDOW_SIZE, dtype='float32'))  # window coordinates = (WINDOW_SIZE,)

    NUM_VALIDATIONS = expert_predictions.shape[1]
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # indices of the validation samples used in the current fold = (NUM_VALIDATIONS,)

    if optimal_params is None:
        params_init = np.concatenate([ np.ones((NUM_EXPERTS,), dtype='float32'),
                                       np.ones((NUM_FILTER_PARAMETERS,), dtype='float32') ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')  # expert weights = (NUM_EXPERTS,)

    C = 0.0001
    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        weights = T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0)
        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)
        # expression
        masked_weights = mask * weights
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')
        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)  # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:,:,None]*weights).astype('float32'))
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf<=0] = np.log(eps)
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)

        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0, 'x')

        #the different predictions, are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log,axis=-1).dimshuffle(0,'x')  # stabilizes rounding errors?

        geom_av = T.exp(geom_av_log)

        geom_pdf = geom_av/T.sum(geom_av,axis=-1).dimshuffle(0,'x')
        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        ind.set_value(range(NUM_VALIDATIONS))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

    CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
    CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

    iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore", updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
    f_val = theano.function([], CRPS_valid)

    def optimize_my_params():
        for _ in xrange(40 if special_average else 100):  # fixed iteration budget (no true early stopping)
            score = iter_optimize()
        result = params.get_value()
        return result, score


    if num_cross_validation_masks==0:

        ind.set_value(range(NUM_VALIDATIONS))
        params.set_value(params_init)
        optimal_params, train_score = optimize_my_params()
        final_weights = -1e10 * np.ones(expert_weights.shape,)
        final_weights[np.where(expert_weights>cutoff)] = optimal_params[:NUM_EXPERTS]
        final_params = np.concatenate(( final_weights, optimal_params[NUM_EXPERTS:]))
        return softmax(final_weights), train_score, final_params
    else:
        final_params = []
        final_losses = []
        print
        print
        print
        for fold in xrange(num_folds):
            for i_cross_validation in xrange(num_cross_validation_masks):
                print "\r\033[F\033[F\033[Fcross_validation %d/%d"%(fold*num_cross_validation_masks+i_cross_validation+1, num_folds*num_cross_validation_masks)
                val_indices = get_cross_validation_indices(range(NUM_VALIDATIONS),
                                                       validation_index=i_cross_validation,
                                                       number_of_splits=num_cross_validation_masks,
                                                       rng_seed=fold,
                                                       )

                indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]


                #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                ind.set_value(indices)
                params.set_value(params_init)
                result, train_score = optimize_my_params()

                final_params.append(result)

                ind.set_value(val_indices)
                validation_score = f_val()
                print "              Current train value: %.6f" % train_score
                print "         Current validation value: %.6f" % validation_score
                final_losses.append(validation_score)

        optimal_params = np.mean(final_params, axis=0)
        average_loss   = np.mean(final_losses)

        expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
        filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS+NUM_FILTER_PARAMETERS]
        #print "filter param result:", filter_param_result

        return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)
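
Example #48 blends the expert CDFs with a masked softmax weighting before scoring them with the CRPS. The following is a reduced, hedged sketch of just that masked weighted average; the shapes (3 experts, 5 samples, 10 bins), the random data, and the 1e30 clip cap are assumptions standing in for the real inputs.

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(0)
preds = theano.shared(rng.rand(3, 5, 10).astype('float32'))     # experts x samples x bins
mask = theano.shared((rng.rand(3, 5) > 0.3).astype('float32'))  # 1 where an expert predicted a sample
w = theano.shared(np.zeros(3, dtype='float32'))                 # unnormalised expert weights

weights = T.nnet.softmax(w.dimshuffle('x', 0)).dimshuffle(1, 0)            # experts x 1
masked_weights = mask * weights                                            # experts x samples
tot = T.clip(masked_weights.sum(axis=0), 1e-7, np.float32(1e30))           # per-sample weight mass
blended = (preds * masked_weights.dimshuffle(0, 1, 'x')).sum(axis=0) / tot.dimshuffle(0, 'x')

print(theano.function([], blended)().shape)  # (5, 10): one blended distribution per sample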
Example #49
0
    def backward(self, seq_inputs, seq_h, seq_updates,
                 grad_seq_att, cov, query=None, extra_grad_seq_h=None,
                 seq_mask=None):
        """
        Parameters
        ----------
        seq_inputs: (length_seq, bs, n_features)
        seq_h: (length_seq, bs, n_hidden)
        seq_updates: (length_seq, bs, n_hidden)
            the update vectors that have been added to the covariance matrix at
            each timestep in the step pass.
        grad_seq_att: (length_seq, bs, n_hidden)
            Gradient of the loss with respect to T.dot(cov, query)
        cov: (bs, n_hidden, n_hidden)
            Covariance matrix computed in the step pass
        query: (bs, n_hidden) or None
        extra_grad_seq_h: (length_seq, bs, n_hidden) or None
            extra gradient added to the hidden-state gradient at each timestep
        seq_mask: (length_seq, bs)
        """
        if seq_mask is None:
            seq_mask = T.ones(seq_inputs.shape[:-1])

        if extra_grad_seq_h is None:
            extra_grad_seq_h = T.zeros(grad_seq_att.shape)

        # Build backward graph of the recurrence
        (back_input, back_mask, back_h_pre, back_grad_h, back_grad_input,
         back_grad_h_pre, back_grad_params) = self.build_backward_rec_graph()

        # Build backward graph of the attention update rule
        (u_h, u_mask, u_C_pre, u_grad_att, u_query,
         (u_grad_h, u_grad_params)) = self.attention_update_rule.build_backward_graph()

        cumsum_grad_seq_att = T.cumsum(grad_seq_att[::-1], axis=0)

        def step(input, mask, cumsum_grad_att, extra_grad_h, h, h_pre, update, grad_h, C,
                 *prev_grad_params):
            """
            A single timestep of the backward pass.

            Parameters
            ----------
            input: (batch_size, n_in)
            mask: (batch_size,)
            cumsum_grad_att: (batch_size, n_hidden)
            h: (batch_size, n_hidden)
            h_pre: (batch_size, n_hidden)
            update: (batch_size, n_hidden)
            grad_h: (batch_size, n_hidden)
            C: (batch_size, n_hidden, n_hidden)
            *prev_grad_params

            Returns
            -------
            grad_input: (batch_size, n_in)
            grad_h_pre: (batch_size, n_hidden)
            C_pre: (batch_size, n_hidden, n_hidden)
            gradients with respect to the params (both of the recurrent and the
             update rule)
            """
            C_pre = self.attention_update_rule.restore_previous_matrix(C, update)

            att_grads = theano.clone(
                output=[u_grad_h] + u_grad_params,
                replace={u_h: h,
                         u_mask: mask,
                         u_C_pre: C_pre,
                         u_grad_att: cumsum_grad_att,
                         u_query: h})

            grad_h_att = att_grads[0]
            grad_params_att = att_grads[1:]

            grad_h_att *= 1000 / T.sum(seq_mask, axis=0)[:, None]
            grad_h_att = T.switch(mask[:, None], grad_h_att, .0)

            rec_grads = theano.clone(
                output=[back_grad_input, back_grad_h_pre] + back_grad_params,
                replace={back_input: input,
                         back_mask: mask,
                         back_h_pre: h_pre,
                         back_grad_h: extra_grad_h + grad_h + grad_h_att})

            grad_input = rec_grads[0]
            grad_h_pre = rec_grads[1]
            grad_params_rec = rec_grads[2:]

            grad_params = grad_params_att + grad_params_rec
            scan_outputs = [grad_input, grad_h_pre, C_pre]
            for prev_grad, grad in zip(prev_grad_params, grad_params):
                scan_outputs.append(prev_grad + grad)

            return tuple(scan_outputs)

        seq_h = T.concatenate([T.zeros_like(seq_h[0:1]), seq_h])

        params = self.attention_update_rule.params + self.rec_params
        grads, _ = theano.scan(
            fn=step,
            sequences=[seq_inputs[::-1], seq_mask[::-1], cumsum_grad_seq_att,
                       extra_grad_seq_h[::-1],
                       dict(input=seq_h[::-1], taps=[0, 1]),
                       seq_updates[::-1]],
            outputs_info=([None, T.zeros_like(seq_h[0]), cov] +
                          [T.zeros_like(m) for m in params]),
            name='backward')
        grads_input = grads[0][::-1]
        grads_param = [g[-1] for g in grads[3:]]

        return grads_input, params, grads_param
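
The backward pass above hands T.cumsum(grad_seq_att[::-1], axis=0) to the scan, so the step at forward time t receives the attention gradient summed over all timesteps >= t. A tiny numeric check of that reversed-cumsum trick, with made-up shapes:

import numpy as np
import theano
import theano.tensor as T

grad_seq_att = T.tensor3('grad_seq_att')           # time x batch x hidden
tail_sums = T.cumsum(grad_seq_att[::-1], axis=0)   # entry k holds the sum over the last k+1 timesteps

f = theano.function([grad_seq_att], tail_sums)
g = np.arange(6, dtype='float32').reshape(3, 2, 1)
print(f(g)[:, :, 0])  # [[4. 5.] [6. 8.] [6. 9.]]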
Example #50
0
 def icdf_z_given_v(self, v):
     return T.cumsum(self.pdf_z_given_v(v)[:, ::-1], axis=1)[:, ::-1]
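
A quick numeric check of the double reversal above (the pdf row is made up): reversing, taking the cumsum, and reversing again yields the tail sums P(Z >= z_k) for each row.

import numpy as np
import theano
import theano.tensor as T

pdf = T.matrix('pdf')
tail = T.cumsum(pdf[:, ::-1], axis=1)[:, ::-1]   # complementary CDF per row

f = theano.function([pdf], tail)
print(f(np.array([[0.1, 0.2, 0.7]], dtype='float32')))  # approximately [[1.0 0.9 0.7]]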
Example #51
0
def monotonicity_penalty(weights, mask_x=None):
    cumsums = tensor.cumsum(weights, axis=2)
    penalties = tensor.maximum(cumsums[1:] - cumsums[:-1], 0).sum(axis=2)
    if mask_x is not None:
        penalties *= mask_x[1:]
    return penalties.sum()
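
A hedged usage sketch of the penalty above with made-up alignments: weights are (target_time, batch, source_time), a monotone alignment costs nothing, and one that walks backwards through the source is penalized.

import numpy as np
import theano
import theano.tensor as T

weights = T.tensor3('weights')                  # target_time x batch x source_time
cumsums = T.cumsum(weights, axis=2)
penalty = T.maximum(cumsums[1:] - cumsums[:-1], 0).sum(axis=2).sum()

f = theano.function([weights], penalty)
monotone = np.array([[[1., 0., 0.]], [[0., 1., 0.]], [[0., 0., 1.]]], dtype='float32')
print(f(monotone), f(monotone[::-1].copy()))    # ~0.0 for the monotone path, 2.0 for the reversed one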
Example #52
0
def _update_neural_stack(self, V_tm1, s_tm1, d_t, u_t, v_t, time, stack=True):
    
    ############################################################
    # Equation 1: write v_t into row `time` of the memory V
  
    V_t = V_tm1
                  
    V_t = T.set_subtensor(V_t[::,time,::],v_t)
        
    ############################################################
    # Equation 2: update the strength vector (pop by u_t, then push d_t at position `time`)
    if stack:
        s_op = T.cumsum(s_tm1[::,1:time][::,::-1],axis=1) #Size t-2
        s_op = s_op[::,::-1]
    #padding
        input_shape = s_op.shape
        output_shape = (input_shape[0],
                        input_shape[1] + 1)
        
        output = T.zeros(output_shape)
        s_op =  T.set_subtensor(output[:, :input_shape[1]], s_op)
    else:
        s_op = T.cumsum(s_tm1[::,:time-1],axis=1) #Size t-2
    #padding
        input_shape = s_op.shape
        output_shape = (input_shape[0],
                        input_shape[1] + 1)
        
        output = T.zeros(output_shape)
        s_op =  T.set_subtensor(output[:, 1:input_shape[1]+1], s_op)
                
    s_op = u_t.dimshuffle(0,"x") - s_op
    
    s_op = T.maximum(s_op,0)
    
    
    #ifelse to deal with time == 0
    #m = T.max()
    #ifelse(T.ge(time,1),time,T.cast(1,"int32"))
 
    s_op = s_tm1[::,:time]-s_op
    
    s_op = T.maximum(s_op,0)
    
    s_t = s_tm1
    
    
    s_t = T.set_subtensor(s_t[::,:time], s_op)
        
    s_t = T.set_subtensor(s_t[::,time], d_t)
    
    
    
    ############################################################
    # Equation 3: read r_t as a strength-weighted sum of the memory rows


    if stack:
        s_op = T.cumsum(s_t[::,1:time+1][::,::-1],axis=1) #Size t-1
        s_op = s_op[::,::-1]
        #left padding
        input_shape = s_op.shape
        output_shape = (input_shape[0],
                        input_shape[1] + 1)
        
        output = T.zeros(output_shape)
        s_op =  T.set_subtensor(output[:, :input_shape[1]], s_op)
    else:
        s_op = T.cumsum(s_t[::,:time],axis=1) #Size t-1
        #left padding
        input_shape = s_op.shape
        output_shape = (input_shape[0],
                        input_shape[1] + 1)
        
        output = T.zeros(output_shape)
        s_op =  T.set_subtensor(output[:,1:1+input_shape[1]], s_op)
    
    # Max operation
    s_op = 1 - s_op
    s_op = T.maximum(s_op,0)
            
    #Min operation
    s_op = T.minimum(s_t[::,:time+1],s_op)

    
    r_t = T.sum(s_op[::,:time+1].dimshuffle(0,1,"x")*V_t[::,:time+1,::],axis=1)
    
    return V_t, s_t, r_t
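
The read in Equation 3 above weights each memory row by min(s_i, max(0, 1 - strength stacked on top of it)). Below is a plain numpy reference of that read for a single batch element; the values are chosen only for illustration and the helper name is hypothetical.

import numpy as np

def stack_read(V, s):
    """V: (t, dim) memory rows, s: (t,) strengths after the push/pop update."""
    r = np.zeros(V.shape[1], dtype=V.dtype)
    for i in range(len(s)):
        above = s[i + 1:].sum()                  # strength sitting on top of row i
        w = min(s[i], max(0.0, 1.0 - above))     # how much of row i is still visible
        r += w * V[i]
    return r

V = np.eye(3, dtype='float32')                   # three one-hot memory rows
print(stack_read(V, np.array([0.5, 0.4, 0.3], dtype='float32')))  # approximately [0.3 0.4 0.3]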