def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs):
    """
    Creates a continuous mixture distribution which can be efficiently
    evaluated and sampled from.

    Args:
        shape: Shape of the distribution. All components must have this
            same shape as well.
        cat_var: Categorical FreeRV whose distribution supplies the mixture
            weights (must be evaluable in the context of the model).
        mus: list of component means.
        taus: list of component precision matrices.
        model: optional model; resolved via modelcontext if omitted.
    """
    super(MvGaussianMixture, self).__init__(*args, shape=shape, **kwargs)
    assert isinstance(cat_var, FreeRV)
    assert isinstance(cat_var.distribution, Categorical)
    self.cat_var = cat_var
    self.model = modelcontext(model)
    weights = cat_var.distribution.p
    self.weights = weights
    self.mus = mus
    self.taus = taus
    self.mu_t = T.stacklists(mus)
    self.tau_t = T.stacklists(taus)
    self.shape = shape
    self.testval = np.zeros(self.shape, self.dtype)
    self.last_cov_value = {}
    self.last_tau_value = {}
    self.param_fn = None
def theano_rot(rx, ry, rz, rescale=True):
    '''Return a theano tensor representing a rotation matrix
    using the specified rotation angles rx, ry, rz.

    If rescale is True, treat the input angles as degrees.
    '''
    if rescale:
        rx = np.pi / 180. * (rx)
        ry = np.pi / 180. * (ry)
        rz = np.pi / 180. * (rz)

    sx = tt.sin(rx)
    sy = tt.sin(ry)
    sz = tt.sin(rz)
    cx = tt.cos(rx)
    cy = tt.cos(ry)
    cz = tt.cos(rz)
    Rx = [[1, 0, 0], [0, cx, -sx], [0, sx, cx]]
    Ry = [[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]]
    Rz = [[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]]
    Rxt = tt.stacklists(Rx)
    Ryt = tt.stacklists(Ry)
    Rzt = tt.stacklists(Rz)
    full_rotation = tt.dot(Rzt, tt.dot(Ryt, Rxt))
    return full_rotation
def theano_rot(rx, ry, rz, rescale=180./np.pi):
    '''Return a theano tensor representing a rotation matrix
    using the specified rotation angles rx, ry, rz.

    Rescale can be used to change the angular units, e.g. to degrees.
    '''
    rx = rx / rescale
    ry = ry / rescale
    rz = rz / rescale

    sx = tt.sin(rx)
    sy = tt.sin(ry)
    sz = tt.sin(rz)
    cx = tt.cos(rx)
    cy = tt.cos(ry)
    cz = tt.cos(rz)
    Rx = [[1., 0., 0.], [0., cx, -sx], [0., sx, cx]]
    Ry = [[cy, 0., sy], [0., 1., 0.], [-sy, 0., cy]]
    Rz = [[cz, -sz, 0.], [sz, cz, 0.], [0., 0., 1.]]
    Rxt = tt.stacklists(Rx)
    Ryt = tt.stacklists(Ry)
    Rzt = tt.stacklists(Rz)
    full_rotation = tt.dot(Rzt, tt.dot(Ryt, Rxt))
    return full_rotation
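# A minimal usage sketch for theano_rot above (hedged: assumes `tt` is
# theano.tensor and `np` is numpy as in the snippets; the angle values are
# illustrative). With the default rescale=180./np.pi the inputs are degrees.
import numpy as np
import theano
import theano.tensor as tt

rx, ry, rz = tt.dscalars('rx', 'ry', 'rz')
R = theano_rot(rx, ry, rz)
rotate = theano.function([rx, ry, rz], R)
m = rotate(90., 0., 0.)
print(np.allclose(np.dot(m, m.T), np.eye(3)))  # rotation matrices are orthonormal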
def sequential_drawing(self, num_examples):
    """Fetches the sequential output of GRAN at each timestep"""
    canvas = self.gen_network.get_samples(num_examples)[1]
    sequential_sams = []
    for i in xrange(self.num_steps):
        sequential_sams.append(T.nnet.sigmoid(T.sum(T.stacklists(canvas[:i+1]), axis=0)))
    return T.stacklists(sequential_sams)
def renet_layer_ud(X, rnn1, rnn2, w, h, wp, hp):
    # def recurrence1(x_t, h_tm1):
    #     dot = T.dot(Wx1, x_t)
    #     h_t = relu(dot + T.dot(h_tm1, Wh1) + Bh1)
    #     return h_t
    # def recurrence2(x_t, h_tm1):
    #     dot = T.dot(Wx2, x_t)
    #     h_t = relu(dot + T.dot(h_tm1, Wh2) + Bh2)
    #     return h_t
    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape((h/hp, X.shape[0]*wp*hp))
        # h1, _ = theano.scan(
        #     fn=recurrence1,
        #     sequences=x,
        #     outputs_info=[H01],
        #     n_steps=x.shape[0]
        # )
        # h2, _ = theano.scan(
        #     fn=recurrence2,
        #     sequences=x,
        #     outputs_info=[H02],
        #     n_steps=x.shape[0],
        #     go_backwards=True
        # )
        h1 = rnn1.output(x)
        h2 = rnn2.output(x, go_backwards=True)
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([h1.T, h2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def theano(self, x, mu, V, ndim, ncomp):
    cholesky = Cholesky(nofail=True, lower=True)
    solve_lower = tt.slinalg.Solve(A_structure="lower_triangular")
    if x.ndim == 1:
        onedim = True
        x = x[None, :]
    else:
        onedim = False
    delta = x[:, None, :] - mu[None, ...]
    logps = []
    for i in range(ncomp):
        _chol_cov = cholesky(V[i])
        k = floatX(ndim)
        diag = tt.nlinalg.diag(_chol_cov)
        # Check if the covariance matrix is positive definite.
        ok = tt.all(diag > 0)
        # If not, replace the diagonal. We return -inf later, but
        # need to prevent solve_lower from throwing an exception.
        chol_cov = tt.switch(ok, _chol_cov, 1)
        delta_trans = solve_lower(chol_cov, delta[:, i].T).T
        _quaddist = (delta_trans**2).sum(axis=-1)
        logdet = tt.sum(tt.log(diag))
        if onedim:
            quaddist = _quaddist[0]
        else:
            quaddist = _quaddist
        norm = -0.5 * k * floatX(np.log(2 * np.pi))
        logp = norm - 0.5 * quaddist - logdet
        # safe logp (-inf for invalid)
        safe_logp = tt.switch(alltrue_elemwise([ok]), logp, -np.inf)
        logps.append(safe_logp)
    return tt.stacklists(logps).T
def get_output_for(self, inputs, **kwargs):
    # see eq. (1) and sec 3.1 in [1]
    input, para = inputs
    num_batch, channels, height, width = input.shape
    _w = T.cast(width, dtype=self.dtype)
    _h = T.cast(height, dtype=self.dtype)
    mat = T.zeros((num_batch, 3, 3), dtype=self.dtype)
    mat = T.set_subtensor(mat[:, 0, 0], const(1.0))
    mat = T.set_subtensor(mat[:, 1, 1], const(1.0))
    mat = T.set_subtensor(mat[:, 2, 2], const(1.0))
    if self.method == 'perspective':
        mat = T.set_subtensor(mat[:, 2, 0], (para[:, 0] / 1e4 - 1e-3) * _w)
        mat = T.set_subtensor(mat[:, 2, 1], (para[:, 1] / 1e4 - 1e-3) * _h)
    elif self.method == 'angle':
        angle = T.cast(T.argmax(para, axis=1), dtype=self.dtype) * np.pi / 90 - np.pi / 3.0
        # ss = np.sqrt(2.0)
        mat = T.set_subtensor(mat[:, :, :], T.stacklists([
            [T.cos(angle), T.sin(angle),
             -(T.cos(angle) * _w + T.sin(angle) * _h - _w) / (2.0 * _w)],
            [-T.sin(angle), T.cos(angle),
             -(-T.sin(angle) * _w + T.cos(angle) * _h - _h) / (2.0 * _h)],
            [constv(0, num_batch, self.dtype), constv(0, num_batch, self.dtype),
             constv(1, num_batch, self.dtype)]]).dimshuffle(2, 0, 1))
        # return [mat, _w, _h]
    elif self.method == 'all':
        mat = T.reshape(para, [-1, 3, 3])
        mat = T.set_subtensor(mat[:, 0, 2], mat[:, 0, 2] / T.cast(width, self.dtype))
        mat = T.set_subtensor(mat[:, 1, 2], mat[:, 1, 2] / T.cast(height, self.dtype))
        mat = T.set_subtensor(mat[:, 2, 0], mat[:, 2, 0] * T.cast(width, self.dtype))
        mat = T.set_subtensor(mat[:, 2, 1], mat[:, 2, 1] * T.cast(height, self.dtype))
    else:
        raise Exception('method not understood.')
    return transform_affine(mat, input, self.method, scale_factor=self.scale_factor)
def jacobian(f: Sequence[Callable], x: Any, constants: list = []) -> TensorVariable:
    # Theano is doing some implicit casting black magic here
    sz = cast(int, shape(f))
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
def jacobian(f, x, constants=[]):
    sz = shape(f)
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
    # Unreachable alternative implementation kept from the original:
    # ret = th.gradient.jacobian(f, x, consider_constant=constants)
    # if isinstance(ret, list):
    #     ret = tt.concatenate(ret, axis=1)
    # return ret
def fixspeed(self, model, momentums):
    paramlayers = model.paramlayers()
    coeff = []
    outs = []
    pid = 0
    for paramlayer in paramlayers:
        # For W
        D = 0
        if isinstance(paramlayer, (layerbase.ConvLayer, layerbase.ConvMaxoutLayer, layerbase.ConvKeepLayer)):
            layershape = paramlayer.params[0].get_value().shape
            fan_in = layershape[1]*layershape[2]*layershape[3]
            fan_out = np.prod(layershape)
            D = 1
        elif isinstance(paramlayer, (layerbase.FullConnectLayer)):
            layershape = paramlayer.params[0].get_value().shape
            fan_in = layershape[0]
            fan_out = np.prod(layershape)
            D = 1
        else:
            coeff.append(1)
        if D:
            layerrate = (self.layertarget*fan_out/fan_in) / (T.sum(abs(momentums[pid]))+1e-10) * self.layerstr \
                + self.baserate * (1-self.layerstr) * self.basedynamic
            coeff.append(layerrate)
            outs.append(momentums[pid]*layerrate)
            pid += 1
        # For other params
        for i in paramlayer.params[D:]:
            outs.append(momentums[pid]*self.baserate)
            pid += 1
    return outs, T.stacklists(coeff)
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape((h/hp, X.shape[0]*wp*hp))
        [h1, s1], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0]
        )
        [h2, s2], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0],
            go_backwards=True
        )
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def f(x, u):
    return tt.stacklists([
        x[3]*tt.cos(x[2]),
        x[3]*tt.sin(x[2]),
        x[3]*u[0],
        u[1] - x[3]*friction
    ])
def logp_(value):
    logps = [tt.log(pi[i]) + logp_normal(mu, sd, value)
             for i, mu in enumerate(mus)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :], axis=0))
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in range(w / wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:, :, j * wp:(j * wp + wp)].dimshuffle(
            (2, 0, 1)).flatten().reshape((h / hp, X.shape[0] * wp * hp))
        [h1, s1], _ = theano.scan(fn=recurrence,
                                  sequences=x,
                                  outputs_info=[H0, None],
                                  n_steps=x.shape[0])
        [h2, s2], _ = theano.scan(fn=recurrence,
                                  sequences=x,
                                  outputs_info=[H0, None],
                                  n_steps=x.shape[0],
                                  go_backwards=True)
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)
    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def crossEntropy(self, y, m):
    return -T.sum(T.stacklists([
        T.mean(T.log(self.p_y_given_x)[i][T.arange(y[i].shape[0]), y[i]] * m[0])
        for i in xrange(200)
    ]))
def get_nade_k_rbm_cost_theano(self, x, input_mask, k):
    """
    log p(x_missing | x_observed)
    x is a matrix of column datapoints (mbxD)
    D = n_visible, mb = mini batch size
    """
    #x_ = utils.corrupt_with_salt_and_pepper(
    #    x, x.shape, self.noise, rng_theano)  # BxD
    print 'building cost function ...'
    output_mask = constantX(1) - input_mask
    D = constantX(self.n_visible)
    d = input_mask.sum(1)
    cost = constantX(0)
    costs_by_step = []
    print 'do %d steps of mean field inference' % k
    P = self.get_nade_k_mean_field(x, input_mask, k)
    costs = []
    for i, p in enumerate(P):
        # Loglikelihood on missing bits
        lp = ((x*T.log(p) + (constantX(1)-x)*T.log(constantX(1)-p))
              * output_mask).sum(1) * D / (D-d)
        this_cost = -T.mean(lp)
        costs.append(this_cost)
        costs_by_step.append(this_cost)
    costs_by_step = T.stack(costs_by_step)
    if not self.cost_from_last:
        cost = T.mean(T.stacklists(costs))
    else:
        cost = costs[-1]
    return cost, costs_by_step
def get_sensi_speci(y_hat, y):
    # y_hat = T.concatenate(T.sum(input=y_hat[:, 0:2], axis=1), T.sum(input=y_hat[:, 2:], axis=1))
    y_hat = T.stacklists([y_hat[:, 0] + y_hat[:, 1],
                          y_hat[:, 2] + y_hat[:, 3] + y_hat[:, 4]]).T
    y_hat = T.argmax(y_hat, axis=1)
    tag = 10 * y_hat + y
    tneg = T.cast((T.shape(tag[(T.eq(tag, 0.)).nonzero()]))[0], config.floatX)
    fneg = T.cast((T.shape(tag[(T.eq(tag, 1.)).nonzero()]))[0], config.floatX)
    fpos = T.cast((T.shape(tag[(T.eq(tag, 10.)).nonzero()]))[0], config.floatX)
    tpos = T.cast((T.shape(tag[(T.eq(tag, 11.)).nonzero()]))[0], config.floatX)
    # assert tneg + fneg + fpos + tpos == 1380
    # gotcha: a plain Python if/else does not work on symbolic scalars,
    # hence theano's ifelse is used for the zero-denominator guards.
    speci = ifelse(T.eq((tneg + fpos), 0), np.float64(float('inf')), tneg / (tneg + fpos))
    sensi = ifelse(T.eq((tpos + fneg), 0), np.float64(float('inf')), tpos / (tpos + fneg))
    return [sensi, speci]
def logp_(value):
    logps = [tt.log(pi[c]) + logp_normal(mus[c], tau, value)
             for c in category]
    return tt.sum(pm.math.logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
def _generate(self):
    '''Turn the specification of the transform into usable mathematical objects'''
    identity = {'tx': 0., 'ty': 0., 'tz': 0., 'rx': 0., 'ry': 0., 'rz': 0., 's': self.full_scale}
    trans = identity
    if self._trans is not None:
        for k in identity.keys():
            try:
                trans[k] = self._trans[k]
                if self._apply_factors_to_trans:
                    if k in ['tx', 'ty', 'tz']:
                        trans[k] *= self.translate_factor
                    elif k in ['rx', 'ry', 'rz']:
                        trans[k] *= self.rotation_scale
                    elif k == 's':
                        trans[k] *= self.full_scale
            except KeyError:
                pass
    if self._R is None:
        self._R = theano_rot(rx=trans['rx'], ry=trans['ry'], rz=trans['rz'],
                             rescale=self.rotation_scale)
    if self._tr is None:
        self._tr = tt.stacklists([trans['tx'], trans['ty'], trans['tz']])
    if self._s is None:
        self._s = trans['s']
def logp_(value):
    aux = tt.ones((n_samples, 1))
    pi = [tt.sum(tt.eq(aux3, aux * cat), axis=1) / 8.0 for cat in range(K)]
    logps = [(pi[i] - 1) * 2 + logp_normal(mu, tau, value)
             for i, mu in enumerate(mus)]
    return tt.sum(tt.stacklists(logps), axis=0)
def logp_(value):
    logps = [tt.log(pi[i]) + logp_normal(mus[i, :], taus[i], value)
             for i in range(n_components)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
def applySentenceAttention(self, premiseOutputs, finalHypothesisOutput, numTimestepsPremise):
    """
    Apply sentence level attention by attending over all premise outputs once
    with the final hypothesis output. Note this is different from word-by-word
    attention over the premise.

    :param premiseOutputs:
    :param finalHypothesisOutput:
    :return:
    """
    # Note: Notation follows that in Rocktaschel's attention mechanism explanation:
    # http://arxiv.org/pdf/1509.06664v2.pdf
    timestep, numSamp, dimHidden = premiseOutputs.shape
    Y = premiseOutputs.reshape((numSamp, timestep, dimHidden))
    WyY = T.dot(Y, self.W_y)  # Computing (WyY).T
    transformedHn = (T.dot(self.W_h, finalHypothesisOutput.T)).T
    repeatedHn = [transformedHn] * numTimestepsPremise  # TODO: Condense this later if it works
    repeatedHn = T.stacklists(repeatedHn)
    repeatedHn = repeatedHn.dimshuffle(1, 0, 2)  # (numSample, timestep, dimHidden)
    M = T.tanh(WyY + repeatedHn)
    # Hackery to make into 2d tensor of (numSamp, timestep)
    alpha = T.nnet.softmax(T.dot(M, self.w).flatten(2))
    Y = Y.dimshuffle(0, 2, 1)
    rOut, updates = theano.scan(
        fn=lambda Yt, alphat: T.dot(Yt, alphat),
        outputs_info=None,
        sequences=[Y, alpha],
        non_sequences=None
    )
    WxHn = T.dot(finalHypothesisOutput, self.W_x)
    WpR = T.dot(rOut, self.W_p)
    hstar = T.tanh(WxHn + WpR)
    return hstar
def f(state: np.ndarray, action: np.ndarray) -> tt.Tensor:
    return tt.stacklists([
        state[3] * tt.cos(state[2]),
        state[3] * tt.sin(state[2]),
        state[3] * action[0],
        action[1] - state[3] * friction,
    ])
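# A minimal sketch of compiling the dynamics above into a callable (hedged:
# the `friction` value and the state/action layout below are illustrative
# assumptions, not taken from the original code).
import numpy as np
import theano
import theano.tensor as tt

friction = 0.1  # assumed constant
x = tt.dvector('x')  # state vector
u = tt.dvector('u')  # action vector
xdot = theano.function([x, u], f(x, u))
print(xdot(np.array([0., 0., 0., 1.]), np.array([0.1, 0.5])))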
def _compute_nary_hessian_vector_product(self, gradients, arguments):
    """Returns a function accepting `2 * len(arguments)` arguments to compute
    a Hessian-vector product of a multivariate function.

    Notes
    -----
    The implementation is based on TensorFlow's '_hessian_vector_product'
    function in 'tensorflow.python.ops.gradients_impl'.
    """
    argument_types = [argument.type() for argument in arguments]
    try:
        Rop = T.Rop(gradients, arguments, argument_types)
    except NotImplementedError:
        proj = [
            T.sum(gradient * disconnected_grad(argument_type))
            for gradient, argument_type in zip(gradients, argument_types)
        ]
        proj_grad = [
            T.grad(proj_elem, arguments, disconnected_inputs="ignore",
                   return_disconnected="None")
            for proj_elem in proj
        ]
        proj_grad_transpose = map(list, zip(*proj_grad))
        proj_grad_stack = [
            T.stacklists([c for c in row if c is not None])
            for row in proj_grad_transpose
        ]
        Rop = [T.sum(stack, axis=0) for stack in proj_grad_stack]
    return self._compile_function_without_warnings(
        list(itertools.chain(arguments, argument_types)), Rop)
def memnn_cost(self, statements, question, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=statements,
        outputs_info=[
            #alloc_zeros_matrix(self.weights.shape[0])
            #alloc_zeros_matrix(self.weights.shape[0]), self.n_embedding)
            alloc_zeros_matrix(self.weights.shape[0], 4800)  # init as 3
            #alloc_zeros_matrix(self.weights.shape[0], 4800, 4)  # init as 4
            #alloc_zeros_matrix(4)
            #alloc_zeros_matrix(110, 4800)
        ],
        non_sequences=[
            #self.weights.dimshuffle(1, 0, 2),
            self.weights,
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    #memories = computed_memories
    #memories = T.stacklists(computed_memories)
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)
    #print computed_memories.shape[0]

    # Embed question
    #u1 = T.sum(self.weights[0][question], axis=0)
    #u1 = [question]
    u1 = question
    #u1 = u1.astype(np.float64)
    #sv = skipthoughts.encode(model, sentence)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    print "memnn_cost running"
    return output[0]
def solve_theano(self, A, bi):
    # closed-form inverse of a 2x2 matrix
    a = A[0, 0]
    b = A[0, 1]
    c = A[1, 0]
    d = A[1, 1]
    A_inv = T.stacklists([[d, -b], [-c, a]]) / (a * d - b * c)
    return T.dot(A_inv, bi).squeeze()
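# A small self-contained check of the closed-form 2x2 solve used above
# (hedged sketch: recomputes the same expression without the class context,
# with illustrative inputs).
import numpy as np
import theano
import theano.tensor as tt

A = tt.dmatrix('A')
b = tt.dvector('b')
a_, b_, c_, d_ = A[0, 0], A[0, 1], A[1, 0], A[1, 1]
A_inv = tt.stacklists([[d_, -b_], [-c_, a_]]) / (a_ * d_ - b_ * c_)
solve2 = theano.function([A, b], tt.dot(A_inv, b))
An = np.array([[3., 1.], [1., 2.]])
bn = np.array([9., 8.])
print(np.allclose(solve2(An, bn), np.linalg.solve(An, bn)))  # expect True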
def logp_(value):
    aux = tt.zeros((n_samples, 1))
    pi = [tt.sum(tt.eq(aux3, aux + cat), axis=1) / 8.0 for cat in range(K)]
    logps = [((pi[i] - 1) * 2 + mv.logp(value)) - tt.sum((pi[i] - 1) * 2 + mv.logp(value))
             for i, mv in enumerate(mus)]
    return tt.sum(tt.stacklists(logps), axis=0)
def __call__(self, X):
    #out = self.W[:, X]
    def step(x):
        return self.W[:x]
    stk = theano.map(lambda x: self.W[x], X)
    out = T.stacklists(stk[0])
    #return out.dimshuffle('x', 'x', 0, 1)
    return out
def __init__(self, rng, input, vocab_size, embed_dm, embeddings=None):
    """
    input: theano.tensor.dmatrix, (number of instances, sentence word number)
    vocab_size: integer, the size of vocabulary
    embed_dm: integer, the dimension of word vector representation
    embeddings: theano.tensor.TensorType, pretrained embeddings
    """
    if embeddings:
        print "Use pretrained embeddings: ON"
        assert embeddings.get_value().shape == (vocab_size, embed_dm), \
            "%r != %r" % (embeddings.get_value().shape, (vocab_size, embed_dm))
        self.embeddings = embeddings
    else:
        print "Use pretrained embeddings: OFF"
        embedding_val = np.asarray(rng.normal(0, 0.05, size=(vocab_size, embed_dm)),
                                   dtype=theano.config.floatX)
        # the <PADDING> character is initialized to 0
        embedding_val[vocab_size - 1, :] = 0
        self.embeddings = theano.shared(np.asarray(embedding_val,
                                                   dtype=theano.config.floatX),
                                        borrow=True, name='embeddings')

    self.params = [self.embeddings]
    self.param_shapes = [(vocab_size, embed_dm)]

    # Return:
    # :type, theano.tensor.tensor4
    # :param, dimension (1, 1, word embedding dimension, number of words in sentence)
    # made to be 4D to fit into the dimension of the convolution operation
    sent_embedding_list, updates = theano.map(
        lambda sent: self.embeddings[sent], input)
    sent_embedding_tensor = T.stacklists(sent_embedding_list)  # make it into a 3D tensor
    self.output = sent_embedding_tensor.dimshuffle(0, 'x', 2, 1)  # make it a 4D tensor
def step(self, x_t, H_x, H_y, M_x, M_y, W_i, W_f, W_o, W_c):
    #H_t = T.ones_like(H_x)
    #M_t = T.ones_like(H_x)
    #H = T.ones_like(H_x)
    H = T.stacklists([x_t, H_x[1], H_y[2]])
    M = T.stacklists([x_t, M_x[1], M_y[2]])
    for i in range(self.n_dim):
        (H_temp, M_temp) = self.LTSM(H, M[i], W_i[i], W_f[i], W_o[i], W_c[i])
        if i == 0:
            H_t = H_temp
            M_t = M_temp
        else:
            H_t = T.concatenate([H_t, H_temp], axis=0)
            M_t = T.concatenate([M_t, M_temp], axis=0)
    return H_t, M_t
def update_fun(param, grad, penaltyparam, dataset, history, opt, params,
               globalLR1, globalLR2, momentParam1, momentParam2):
    epsilon = np.asarray(0.0, dtype=theano.config.floatX)

    def separateLR(params, sharedName, globalLR1, globalLR2):
        sharedName = sharedName[:-2]
        customizedLR = globalLR2
        if (sharedName in params.rglrzLR.keys()) or (not params.adaptT2LR):
            customizedLR = globalLR2 * params.rglrzLR[sharedName]
        return customizedLR

    assert dataset in ['T1', 'T2']
    lr = globalLR1 if dataset == 'T1' else separateLR(params, param.name, globalLR1, globalLR2)

    # Standard update
    if opt is None:
        updates = []
        if params.trackGrads:
            old_grad = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                                     broadcastable=param.broadcastable,
                                     name='oldgrad_%s' % param.name)
            updates += [(old_grad, grad)]
            grad_mean = T.mean(T.sqrt(grad**2))
            grad_rel = T.mean(T.sqrt((grad/(param+1e-12))**2))
            grad_angle = T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+1e-12)
            check = T.stacklists([grad_mean, grad_rel, grad_angle])
            other = [grad]
        else:
            check = grad
            other = [grad]
        up = -lr * grad
    else:
        up, updates, check, other = opt.up(param, grad, params, lr=lr, dataset=dataset)

    # dictionary param to grad (first time around)
    if params.useT2 and dataset == 'T1':
        history['grad'][param] = grad
        history['up'][param] = up

    # add momentum to update
    if params.use_momentum:
        oldup = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                              broadcastable=param.broadcastable,
                              name='oldup_%s' % param.name)
        momentParam = momentParam1 if dataset == 'T1' else momentParam2
        up += momentParam * oldup
        updates += [(oldup, up)]

    # New parameter
    newparam = param + up

    # min value | NOTE assumption: all hyperparams can only be positive
    if dataset == 'T2':
        newparam = T.maximum(epsilon, newparam)

    updates += [(param, newparam)]
    paramUpPair = [(param, check)]
    adamGrad = [other]
    return updates, paramUpPair, adamGrad
def step(*args):
    """
    z_tmp, ..., z_tm1 \in R^{1, n_hidden}
    """
    z_stack = T.stacklists(args)
    z_merge = z_stack * self.W
    z_t = T.sum(z_merge, axis=0)
    y_t = T.dot(z_t, self.W_o) + self.b_o
    return z_t, y_t
def f(x, u):
    return tt.stacklists([
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * tt.cos(x[2]) + x[0],
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * tt.sin(x[2]) + x[1],
        ((u[1] - friction * x[3]**2) * dt**2 / 2 + x[3] * dt) * u[0] + x[2],
        (u[1] - friction * x[3]**2) * dt + x[3]
    ])
def TestStack():
    x = T.matrix('x')
    y = T.matrix('y')
    z = T.matrix('z')
    f = theano.function([x, y, z], T.stacklists([x, y, z]))
    a = np.ones((5, 4), dtype=np.float32)
    b = np.ones((5, 4), dtype=np.float32)
    c = np.ones((5, 4), dtype=np.float32)
    d = f(a, b, c)
    print(d.shape)
    print(d)
def jacobian(f, x, constants=[]):
    #sz = shape(f)  # this produced a bug
    #sz = shape(f)[0]  # alternative formulation found later in code, should get the same result
    sz = int(shape(f))  # put in in response to bug. This seems to work
    return tt.stacklists([grad(f[i], x) for i in range(sz)])
    # Unreachable alternative implementation kept from the original:
    # ret = th.gradient.jacobian(f, x, consider_constant=constants)
    # if isinstance(ret, list):
    #     ret = tt.concatenate(ret, axis=1)
    # return ret
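# A minimal sketch of the same stacklists-based Jacobian built directly with
# theano (hedged: `shape` and `grad` in the helper above come from its
# enclosing module; plain theano calls and illustrative costs are used here).
import theano
import theano.tensor as tt

x = tt.dvector('x')
fs = [tt.sum(x ** 2), tt.prod(x)]  # two scalar outputs
J = tt.stacklists([tt.grad(fi, x) for fi in fs])  # one gradient per row
jac = theano.function([x], J)
print(jac([1., 2., 3.]))  # 2x3 Jacobian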
def compute_hessian(self, objective, argument):
    """
    Computes the directional derivative of the gradient (which is equal to
    the Hessian multiplied by direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same
    # dimensionality) as argument.
    is_product_manifold = isinstance(argument, (list, tuple))
    if not is_product_manifold:
        A = argument.type()
    else:
        A = [arg.type() for arg in argument]

    # First attempt efficient 'R-op', this directly calculates the
    # directional derivative of the gradient.
    try:
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        # Implementation based on
        # tensorflow.python.ops.gradients_impl._hessian_vector_product
        if not is_product_manifold:
            proj = T.sum(g * disconnected_grad(A))
            R = T.grad(proj, argument)
        else:
            proj = [T.sum(g_elem * disconnected_grad(a_elem))
                    for g_elem, a_elem in zip(g, A)]
            proj_grad = [T.grad(proj_elem, argument,
                                disconnected_inputs="ignore",
                                return_disconnected="None")
                         for proj_elem in proj]
            proj_grad_transpose = map(list, zip(*proj_grad))
            proj_grad_stack = [T.stacklists([c for c in row if c is not None])
                               for row in proj_grad_transpose]
            R = [T.sum(stack, axis=0) for stack in proj_grad_stack]

    if not is_product_manifold:
        hess = theano.function([argument, A], R, on_unused_input="warn")
    else:
        hess_prod = theano.function(argument + A, R, on_unused_input="warn")

        def hess(x, a):
            return hess_prod(*(x + a))

    return hess
def convert2class(self, y_hat, y):
    # y_hat = T.set_subtensor(y_hat[(y_hat < 1).nonzero()], 0)
    # y_hat = T.set_subtensor(y_hat[(y_hat >= 1).nonzero()], 1)
    # y_hat = T.stacklists([y_hat[:, 0] + y_hat[:, 1], y_hat[:, 2] + y_hat[:, 3] + y_hat[:, 4]])
    y_hat = T.stacklists([T.sum(y_hat[:, 0:2], axis=1),
                          T.sum(y_hat[:, 2:], axis=1)]).T
    y_hat = T.argmax(y_hat, axis=1)
    # y_hat = T.set_subtensor(y_hat[(y_hat < 2).nonzero()], 0)
    # y_hat = T.set_subtensor(y_hat[(y_hat >= 2).nonzero()], 1)
    y = T.set_subtensor(y[(y < 2).nonzero()], 0)
    y = T.set_subtensor(y[(y >= 2).nonzero()], 1)
    return [y_hat, y]
def special_SaP_noise_4_jyc(rng, input, corruption_level):
    # salt and pepper noise
    print 'DAE uses salt and pepper noise'
    a = MRG.binomial(size=input.shape, n=1,
                     p=1 - corruption_level, dtype=theano.config.floatX)
    b = MRG.binomial(size=input.shape, n=1,
                     p=corruption_level, dtype=theano.config.floatX)
    c = T.eq(a, 0) * b
    mask = -a + c
    CX = input * a + c
    return T.stacklists([CX, mask])
def get_samples(self, num_sam, scanF=True):
    """
    Retrieves the samples for the current time step.
    Uncomment parts when the time step changes.
    """
    print 'Get_sample func: Number of steps iterate over ::: %d' % self.num_steps
    H_Ct = T.alloc(0., num_sam, self.dim_sample)
    #Z = MRG.normal(size=(num_sam, self.dim_sample), avg=0., std=1.)
    Zs = MRG.normal(size=(self.num_steps, num_sam, self.dim_sample), avg=0., std=1.)
    Canvases = self.apply_recurrence(self.num_steps, Zs, H_Ct)
    C = T.sum(T.stacklists(Canvases), axis=0)
    return activation_fn_th(C, atype='sigmoid'), Canvases
def memnn_cost(self, statements, question, ans, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
        ],
        non_sequences=[
            self.weights.dimshuffle(1, 0, 2),
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = T.sum(self.weights[0][question], axis=0)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Score answers
    u4 = o3 + T.dot(u3, self.H)

    # Embed answers
    a1 = T.sum(self.A[ans[0]], axis=0)
    a2 = T.sum(self.A[ans[1]], axis=0)
    a3 = T.sum(self.A[ans[2]], axis=0)
    a4 = T.sum(self.A[ans[3]], axis=0)
    a = T.stack(a1, a2, a3, a4)

    scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
    output = T.nnet.softmax(scores)
    return output[0]
def _output(self, input, *args, **kwargs):
    # randomly roll each image along both spatial axes
    x = srng.uniform(size=(self.batch_size,), high=self.img_size)
    y = srng.uniform(size=(self.batch_size,), high=self.img_size)
    x = T.cast(x, 'int32')
    y = T.cast(y, 'int32')
    r = []
    for i in range(self.batch_size):
        item = input[i]
        item = T.concatenate([item[:, x[i]:, :], item[:, :x[i], :]], axis=1)
        item = T.concatenate([item[:, :, y[i]:], item[:, :, :y[i]]], axis=2)
        r.append(item)
    r = T.stacklists(r)
    return r
def ff_step(single_q, single_m, ev1, ev2, ev3, evo, single_proj):
    qemb_t = tensor.dot(single_q, tparams['ff_q_emb'])

    # layer 1 (normalized with gamma/beta, batch statistics at train time,
    # running statistics ev1 at test time)
    l1_t_linear = tensor.dot(single_proj, tparams['W_ff_h1']) + tensor.dot(qemb_t, tparams['W_ff_q'])
    print 'l1_t_linear.ndim: %d' % (l1_t_linear.ndim)
    e_l1_t_ = l1_t_linear.mean(axis=0)
    print 'e_l1_t_.ndim: %d' % (e_l1_t_.ndim)
    v_l1_t_ = ((l1_t_linear - e_l1_t_) ** 2).mean(axis=0)
    print 'v_l1_t_.ndim: %d' % (v_l1_t_.ndim)
    e_l1_t = tensor.switch(use_noise, e_l1_t_, ev1[0])
    print 'ev1[0].ndim: %d' % (ev1[0].ndim)
    print 'e_l1_t.ndim: %d' % (e_l1_t.ndim)
    v_l1_t = tensor.switch(use_noise, v_l1_t_, ev1[1])
    print 'ev1[1].ndim: %d' % (ev1[1].ndim)
    print 'v_l1_t.ndim: %d' % (v_l1_t.ndim)
    l1_t_hat = tparams['gamma_l1'] * ((l1_t_linear - e_l1_t) / (v_l1_t + 0.0001) ** 0.5) + tparams['b_ff_h1']
    print 'l1_t_hat.ndim: %d' % (l1_t_hat.ndim)
    h1_t = tensor.nnet.sigmoid(l1_t_hat)
    print 'h1_t.ndim: %d' % (h1_t.ndim)

    # layer 2
    l2_t_linear = tensor.dot(h1_t, tparams['W_ff_h2'])
    e_l2_t_ = l2_t_linear.mean(axis=0)
    v_l2_t_ = ((l2_t_linear - e_l2_t_) ** 2).mean(axis=0)
    e_l2_t = tensor.switch(use_noise, e_l2_t_, ev2[0])
    v_l2_t = tensor.switch(use_noise, v_l2_t_, ev2[1])
    l2_t_hat = tparams['gamma_l2'] * ((l2_t_linear - e_l2_t) / (v_l2_t + 0.0001) ** 0.5) + tparams['b_l2']
    h2_t = tensor.nnet.sigmoid(l2_t_hat)

    # layer 3
    l3_t_linear = tensor.dot(h2_t, tparams['W_ff_h3'])
    e_l3_t_ = l3_t_linear.mean(axis=0)
    v_l3_t_ = ((l3_t_linear - e_l3_t_) ** 2).mean(axis=0)
    e_l3_t = tensor.switch(use_noise, e_l3_t_, ev3[0])
    v_l3_t = tensor.switch(use_noise, v_l3_t_, ev3[1])
    l3_t_hat = tparams['gamma_l3'] * ((l3_t_linear - e_l3_t) / (v_l3_t + 0.0001) ** 0.5) + tparams['b_l3']
    h3_t = tensor.nnet.softplus(l3_t_hat)

    # output layer
    o_t_linear = tensor.dot(h3_t, tparams['W_ff_o'])
    e_o_t_ = o_t_linear.mean(axis=0)
    v_o_t_ = ((o_t_linear - e_o_t_) ** 2).mean(axis=0)
    e_o_t = tensor.switch(use_noise, e_o_t_, evo[0])
    v_o_t = tensor.switch(use_noise, v_o_t_, evo[1])
    o_t_hat = tparams['gamma_o'] * ((o_t_linear - e_o_t) / (v_o_t + 0.0001) ** 0.5) + tparams['b_ff_o']
    o_t = o_t_hat * single_m[:, None]

    return (o_t, qemb_t, single_proj, h1_t, h2_t, h3_t,
            tensor.stacklists([e_l1_t, v_l1_t]), tensor.stacklists([e_l2_t, v_l2_t]),
            tensor.stacklists([e_l3_t, v_l3_t]), tensor.stacklists([e_o_t, v_o_t]))
def __init__(self, input_train, input_test, input_shape, seq_max_len, n_out=10):
    super(SequenceSoftmax, self).__init__(None, input_train, input_test)
    self.n_softmax = seq_max_len + 1
    self.input_shape = input_shape
    n_in = np.prod(input_shape[1:])
    self.n_out = n_out

    # generate n_softmax W matrices
    def gen_W(out, k):
        return theano.shared(value=np.zeros((n_in, out), dtype=theano.config.floatX),
                             name='W' + str(k), borrow=True)
    self.Ws = [gen_W(seq_max_len, 0)]
    self.Ws.extend([gen_W(self.n_out, _ + 1) for _ in range(seq_max_len)])

    # generate n_softmax b vectors
    def gen_b(out, k):
        return theano.shared(value=np.zeros((out,), dtype=theano.config.floatX),
                             name='b' + str(k), borrow=True)
    self.bs = [gen_b(seq_max_len, 0)]
    self.bs.extend([gen_b(n_out, _ + 1) for _ in range(seq_max_len)])

    assert len(self.Ws) == self.n_softmax
    assert len(self.bs) == self.n_softmax

    # p_y_given_x[k]: kth output for all y, each of size (batch_size * n_out)
    self.p_y_given_x = [T.nnet.softmax(T.dot(self.input_test, self.Ws[k]) + self.bs[k])
                        for k in xrange(self.n_softmax)]
    # self.pred[idx]: output labels of the 'idx' input
    self.pred = [T.argmax(self.p_y_given_x[k], axis=1) for k in xrange(self.n_softmax)]
    self.pred = T.stacklists(self.pred).dimshuffle(1, 0)
    if self.has_dropout_input:
        self.p_y_given_x = [T.nnet.softmax(T.dot(self.input_train, self.Ws[k]) + self.bs[k])
                            for k in xrange(self.n_softmax)]

    self.params = copy(self.Ws)
    self.params.extend(self.bs)
def memnn_cost(self, statements, question, pe_matrix):
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=statements,
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], 4800)  # init as 3
        ],
        non_sequences=[
            #self.weights.dimshuffle(1, 0, 2),
            self.weights,
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = question
    #u1 = weights[0] * question
    #sv = skipthoughts.encode(model, sentence)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    print "memnn_cost running"
    return output[0]
def __init__(self, input_list, n_in, n_out, n_total, mask, batch, W=None, b=None, M=None):
    w = np.zeros((n_in, n_out))
    np.fill_diagonal(w, 1)
    if W is None:
        #W = theano.shared(np.random.randn(n_in, n_out).astype(dtype=theano.config.floatX)/np.sqrt(n_in))
        W = theano.shared(w.astype(dtype=theano.config.floatX)/np.sqrt(n_in))
    if b is None:
        b = theano.shared(np.zeros(n_out).astype(dtype=theano.config.floatX))
    if M is None:
        M = theano.shared(0.5 * np.ones((n_total, 2)).astype(dtype=theano.config.floatX))
    self.W = W
    self.b = b
    self.M = M
    self.v_W = theano.shared(np.zeros((n_in, n_out)).astype(dtype=theano.config.floatX))
    self.v_b = theano.shared(np.zeros(n_out).astype(dtype=theano.config.floatX))
    self.v_M = theano.shared(np.zeros((n_total, 2)).astype(dtype=theano.config.floatX))
    self.input_list = input_list
    self.input_list[0] = self.input_list[0]
    self.input_list[1] = (self.input_list[1])[::-1]

    '''
    def Merge(input_seq1, input_seq2, merger):
        return T.dot((input_seq1 * merger[0] + input_seq2 * merger[1]), self.W) + self.b
    self.temp_y = a.softmax((theano.scan(Merge, sequences=[self.input_list[0], self.input_list[1], self.M], outputs_info=None))[0])
    '''
    def Merge(input_seq1, input_seq2):
        return T.dot((input_seq1 * 1 + input_seq2 * 0), self.W) + self.b
    self.temp_y = a.softmax((theano.scan(Merge, sequences=[self.input_list[0], self.input_list[1]], outputs_info=None))[0])
    self.temp_y = self.temp_y.dimshuffle(1, 0, 2)
    self.mask = mask
    self.batch = batch
    y_pred_list = []
    for i in range(self.batch):
        y_pred_list.append(T.set_subtensor(T.argmax(self.temp_y[i], axis=1)[self.mask[i]:], 0))
    self.y_pred = T.stacklists(y_pred_list)
    self.params = [self.W, self.b, self.M]
    self.velo = [self.v_W, self.v_b, self.v_M]
def grad_monitor(param, grad, updates, params, opt, g_t=0., m=0., v=0., e=1e-10):
    zero = np.float32(0.)
    eps = 1e-10
    old_grad = theano.shared(np.float32(param.get_value()) * zero,
                             name="old_grad_%s" % param.name)
    updates.append((old_grad, grad))
    sharedName, _ = param.name.split('_')

    # tracked gradient values when adaptive learning rate
    if opt == 'adam':
        old_g_t = m / (T.sqrt(v) + e)
        all_grads = {
            'grad': T.mean(T.sqrt(grad**2)),
            #'grad_rel': T.mean(T.sqrt((grad/(param+1e-12))**2)),
            'grad_angle': T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+eps),
            #'grad_max': T.max(T.sqrt(grad**2)),
            'p_t': T.mean(T.sqrt((g_t)**2)),
            #'p_t_rel': T.mean(T.sqrt((g_t/(param+1e-12))**2)),
            'p_t_angle': T.sum(g_t*old_g_t)/(T.sqrt(T.sum(g_t**2))*T.sqrt(T.sum(old_g_t**2)+eps)),
            #'p_t_max': T.max(T.sqrt(grad**2))
        }
    # tracked gradient values when regular SGD (+momentum)
    elif opt is None:
        all_grads = {
            'grad': T.mean(T.sqrt(grad**2)),
            #'grad_rel': T.mean(T.sqrt((grad/(param+1e-12))**2)),
            'grad_angle': T.sum(grad*old_grad)/(T.sqrt(T.sum(grad**2))*T.sqrt(T.sum(old_grad**2))+eps),
            #'grad_max': T.max(T.sqrt(grad**2))
        }

    # store tracked grads for output
    temp = []
    if params.listGrads == 'all':
        for grad_type in all_grads.keys():
            temp += [all_grads[grad_type]]
    else:
        for grad_type in filter(lambda name: name in all_grads.keys(), params.listGrads):
            temp += [all_grads[grad_type]]
    trackGrads = T.stacklists(temp)

    return updates, trackGrads
def get_aggregator(self):
    initialized = shared_like(0.)
    numerator_acc = shared_like(self.numerator)
    denominator_acc = shared_like(self.denominator)
    squared_num_acc = shared_like(self.squared_num)

    conditional_update_num = ifelse(initialized,
                                    self.numerator + numerator_acc,
                                    self.numerator)
    conditional_update_den = ifelse(initialized,
                                    self.denominator + denominator_acc,
                                    self.denominator)
    conditional_update_sqn = ifelse(initialized,
                                    self.squared_num + squared_num_acc,
                                    self.squared_num)

    initialization_updates = [(numerator_acc, tensor.zeros_like(numerator_acc)),
                              (denominator_acc, tensor.zeros_like(denominator_acc)),
                              (squared_num_acc, tensor.zeros_like(squared_num_acc)),
                              (initialized, 0.)]
    accumulation_updates = [(numerator_acc, conditional_update_num),
                            (denominator_acc, conditional_update_den),
                            (squared_num_acc, conditional_update_sqn),
                            (initialized, 1.)]
    # readout is [mean, variance], with the variance computed as E[x^2] - E[x]^2
    readout_variable = tensor.stacklists(
        [(numerator_acc / denominator_acc),
         ((squared_num_acc / denominator_acc) - (numerator_acc / denominator_acc)**2)])

    aggregator = Aggregator(aggregation_scheme=self,
                            initialization_updates=initialization_updates,
                            accumulation_updates=accumulation_updates,
                            readout_variable=readout_variable)
    return aggregator
def multi_grad(costs, params):
    """
    Computes the gradient for several different costs separately and provides
    a rank+1 parameter gradient for each gradient with dimension 0
    corresponding to a different cost.
    """
    all_grads = []
    for param in params:
        if len(costs) > 1:
            param_grads = []
            for cost in costs:
                gparam = T.grad(cost, param)
                param_grads.append(gparam)
            all_grads.append(T.stacklists(param_grads))
        else:
            gparam = T.grad(costs[0], param)
            all_grads.append(gparam)
    return all_grads
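# A small usage sketch for multi_grad (hedged: the shared variable and the
# two costs below are illustrative): with more than one cost, each parameter
# gets a stacked gradient with one row per cost.
import numpy as np
import theano
import theano.tensor as tt

w = theano.shared(np.array([1., 2.]), name='w')
costs = [tt.sum(w ** 2), tt.sum(w)]
grads = multi_grad(costs, [w])
f = theano.function([], grads)
print(f()[0])  # rows: d(sum(w^2))/dw = 2w, d(sum(w))/dw = 1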
def memnn_cost(self, statements, question, pe_matrix):
    # statements: list of list of word indices
    # question: list of word indices
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
        ],
        non_sequences=[
            self.weights.dimshuffle(1, 0, 2),
            pe_matrix
        ],
        truncate_gradient=-1,
    )
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Embed question
    u1 = T.sum(self.weights[0][question], axis=0)

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))
    return output[0]
def renet_layer_lr_noscan(X, rnn1, rnn2, w, h, wp, hp):
    list_of_images = []
    for i in xrange(h/hp):
        # x = X[:,i*hp:(i*hp + hp),:].dimshuffle((2, 0, 1)).flatten().reshape((w/wp, X.shape[0]*wp*hp))
        h_tm1 = rnn1.H0
        hr_tm1 = rnn2.H0
        h1 = []
        h2 = []
        for j in xrange(w/wp):
            x = X[:,i*hp:(i*hp + hp),j*wp:(j*wp + wp)].flatten()
            h_t = rnn1.recurrence(x, h_tm1)
            h1.append(h_t)
            h_tm1 = h_t

            jr = w/wp - j - 1
            xr = X[:,i*hp:(i*hp + hp),jr*wp:(jr*wp + wp)].flatten()
            hr_t = rnn2.recurrence(xr, hr_tm1)
            h2.append(hr_t)
            hr_tm1 = hr_t

        img = T.concatenate([h1, h2])
        list_of_images.append(img)

    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def grid_lstm_cube(tparams, origin_data, options, prefix='lstm', mask=None):
    # size_1 = origin_data.shape[0]
    size_1 = options['grid_depth_1']
    size_2 = options['grid_depth_2']
    size_3 = options['grid_depth_3']
    dim_hidden = options['dim_hidden']
    if origin_data.ndim == 3:
        n_samples = origin_data.shape[1]
    else:
        n_samples = 1
    assert mask is not None

    input_data = tensor.dot(origin_data, tparams[_p(prefix, 'W')])

    h_list_all = []  # four dim tensor of hidden states
    c_list_all = []
    h_input_all = []
    for i in range(size_1):
        h_list_all.append([])
        c_list_all.append([])
        for j in range(size_2):
            h_list_all[i].append([])
            c_list_all[i].append([])
            for k in range(size_3):
                #print i, j, k
                if i < 1:
                    h_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    c_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_1 = h_list_all[i-1][j][k][0]
                    c_1 = c_list_all[i-1][j][k][0]
                if j < 1:
                    c_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    #if k >= 1:
                    #    h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    #else:
                    #    h_2 = input_data[i]
                    #    h_input_all.append(h_2)
                    h_2 = input_data[i]
                    h_input_all.append(h_2)
                    #h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_2 = h_list_all[i][j-1][k][1]
                    c_2 = c_list_all[i][j-1][k][1]
                if k < 1:
                    c_3 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                    h_3 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                else:
                    h_3 = h_list_all[i][j][k-1][2]
                    c_3 = c_list_all[i][j][k-1][2]

                h1, h2, h3, c1, c2, c3 = grid_lstm_block(tparams, h_1, h_2, h_3,
                                                         c_1, c_2, c_3, options,
                                                         prefix, mask[i, :])
                #print h1.ndim, h2.ndim, h3.ndim
                h_list_sides = tensor.stack([h1, h2, h3])
                #print h_list_sides.ndim
                h_list_all[i][j].append(h_list_sides)
                c_list_sides = tensor.stack([c1, c2, c3])
                c_list_all[i][j].append(c_list_sides)

    # h_list_all: first three indices are the cube position; the last is the
    # output dimension of that block, from 0 to 2.
    out_list = [h_list_all[i][-1][-1][2] for i in range(size_1)]
    print 'every h to stack is in dim: %d' % (h_list_all[-1][-1][-1][2].ndim)
    proj = tensor.stack(out_list)
    print 'proj.ndim is %d' % (proj.ndim)
    all_medium_states = tensor.stacklists(h_list_all)
    print 'all_medium_states.ndim is %d' % (all_medium_states.ndim)
    h_input_all = tensor.stacklists(h_input_all)
    return proj, all_medium_states, h_input_all
def logp_(value):
    logps = [tt.log(pi[k]) + logp_normal(mus[k], taus[k], value)
             for k in range(K)]
    return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
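# The logp_ closures above call a `logsumexp` helper that the snippets do not
# define. A standard numerically stable version (an assumption matching
# pm.math.logsumexp, not necessarily the original helper) would be:
import theano.tensor as tt

def logsumexp(x, axis=None):
    # subtract the max before exponentiating to avoid overflow
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max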
def grid_lstm_cube(use_noise, population, tparams, origin_data, options, prefix='lstm', mask=None):
    # size_1 = origin_data.shape[0]
    size_1 = options['grid_depth_1']
    size_2 = options['grid_depth_2']
    # size_3 = options['grid_depth_3']
    dim_hidden = options['dim_hidden']
    if origin_data.ndim == 3:
        n_samples = origin_data.shape[1]
    else:
        n_samples = 1
    assert mask is not None

    input_data = tensor.dot(origin_data, tparams[_p(prefix, 'W')])

    h_list_all = []  # four dim tensor of hidden states
    c_list_all = []
    h_input_all = []
    bnstates_all = []
    for i in range(size_1):
        h_list_all.append([])
        c_list_all.append([])
        bnstates_all.append([])
        for j in range(size_2):
            if i < 1:
                h_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                c_1 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
            else:
                h_1 = h_list_all[i-1][j][0]
                c_1 = c_list_all[i-1][j][0]
            if j < 1:
                c_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                #if k >= 1:
                #    h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                #else:
                #    h_2 = input_data[i]
                #    h_input_all.append(h_2)
                #    #h_2 = tensor.alloc(numpy_floatX(0.), n_samples, dim_hidden)
                h_2 = input_data[i]
                h_input_all.append(h_2)
            else:
                h_2 = h_list_all[i][j-1][1]
                c_2 = c_list_all[i][j-1][1]

            h1, h2, c1, c2, bnstates = grid_lstm_block(use_noise, population, i, j,
                                                       tparams, h_1, h_2, c_1, c_2,
                                                       options, prefix, mask[i, :])
            #print h1.ndim, h2.ndim, h3.ndim
            h_list_sides = tensor.stacklists([h1, h2])
            #print h_list_sides.ndim
            h_list_all[i].append(h_list_sides)
            c_list_sides = tensor.stacklists([c1, c2])
            c_list_all[i].append(c_list_sides)
            print type(bnstates)
            print 'bnstates[1].ndim is %d' % (bnstates[1].ndim)
            bnstates_ = tensor.stacklists(bnstates)
            bnstates_all[i].append(bnstates_)

    # h_list_all: first two indices are the grid position; the last is the
    # output dimension of that block, from 0 to 1.
    out_list_1 = [h_list_all[i][-1][1] for i in range(size_1)]
    out_list_0 = [h_list_all[i][-1][0] for i in range(size_1)]
    out_list_c1 = [c_list_all[i][-1][1] for i in range(size_1)]
    out_list_c0 = [c_list_all[i][-1][0] for i in range(size_1)]
    print 'every h to stacklists is in dim: %d' % (h_list_all[-1][-1][1].ndim)
    proj_h1 = tensor.stacklists(out_list_1)
    proj_h0 = tensor.stacklists(out_list_0)
    proj_c1 = tensor.stacklists(out_list_c1)
    proj_c0 = tensor.stacklists(out_list_c0)
    proj = tensor.concatenate([proj_h1, proj_h0, proj_c1, proj_c0], axis=2)
    print 'proj.ndim is %d' % (proj.ndim)
    all_medium_states = tensor.stacklists(h_list_all)
    all_bn_states = tensor.stacklists(bnstates_all)
    print 'all_medium_states.ndim is %d' % (all_medium_states.ndim)
    h_input_all = tensor.stacklists(h_input_all)
    return proj, all_medium_states, h_input_all, all_bn_states
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {None: 0} self.ivocab = {0: None} self.word2vec = word2vec self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in self.max_inp_sent_len = 0 self.max_q_len = 0 """ #To Use All Vocab self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 
'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #self.vocab = {'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 
88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} #self.ivocab = {1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 
    self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw)
    self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw)
    self.vocab_size = len(self.vocab)

    # symbolic inputs: a story is an int matrix of word ids (one row per
    # sentence), a question is an int vector, the answer is a single word id
    self.input_var = T.imatrix('input_var')
    self.q_var = T.ivector('question_var')
    self.answer_var = T.iscalar('answer_var')
    self.input_mask_var = T.ivector('input_mask_var')
    self.attentions = []

    self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
    self.pe_matrix_q = self.pe_matrix(self.max_q_len)

    print "==> building input module"
    # positional encoder weights (word embeddings)
    self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))

    # biGRU input fusion weights: one set of reset/update/hidden (res/upd/hid)
    # GRU matrices per direction
    self.W_inp_res_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_upd_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_hid_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_res_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_upd_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_inp_hid_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))

    # encode each sentence of the story into a single vector via the
    # positional-encoding sum, then fuse the sentence vectors with the
    # bidirectional GRU defined above
    self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in,
                                        sequences=self.input_var)
    self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)
    #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked)
    self.inp_c = self.input_module_full(self.inp_sent_reps)

    self.q_q = self.sum_pos_encodings_q(self.q_var)
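    # Hedged sketch (assumption -- `pe_matrix` is defined elsewhere in this
    # class): the positional encoding used by DMN+ (Xiong et al., 2016;
    # originally from end-to-end memory networks) weights word j of an M-word
    # sentence in embedding dimension k of D as
    #
    #   l[j, k] = (1 - j/M) - (k/D) * (1 - 2*j/M)    (1-indexed j, k)
    #
    # A NumPy reference implementation would look like:
    #
    #   def pe_matrix_reference(num_words, dim):
    #       pe = np.zeros((num_words, dim), dtype=floatX)
    #       for j in range(1, num_words + 1):
    #           for k in range(1, dim + 1):
    #               pe[j - 1, k - 1] = ((1.0 - float(j) / num_words)
    #                                   - (float(k) / dim) * (1.0 - 2.0 * float(j) / num_words))
    #       return pe
    #
    # The sentence vector is then sum_j l[j] * W_pe[word_j], which is
    # presumably what sum_pos_encodings_in / sum_pos_encodings_q compute.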
    print "==> creating parameters for memory module"
    # untied memory weights: a separate GRU weight slice per memory hop
    self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))

    #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
    # attention gate weights; the gate's input feature vector has size 4 * dim
    self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim))
    self.W_2 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, 1, self.dim))
    self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,))
    self.b_2 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, 1,))

    print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
    memory = [self.q_q.copy()]
    for hop in range(1, self.memory_hops + 1):
        # select this hop's weight slices (weights are untied across hops)
        self.mem_weight_num = hop - 1
        current_episode = self.new_episode(memory[hop - 1])
        memory.append(self.GRU_update(memory[hop - 1], current_episode,
                                      self.W_mem_res_in[self.mem_weight_num], self.W_mem_res_hid[self.mem_weight_num], self.b_mem_res[self.mem_weight_num],
                                      self.W_mem_upd_in[self.mem_weight_num], self.W_mem_upd_hid[self.mem_weight_num], self.b_mem_upd[self.mem_weight_num],
                                      self.W_mem_hid_in[self.mem_weight_num], self.W_mem_hid_hid[self.mem_weight_num], self.b_mem_hid[self.mem_weight_num]))

    last_mem_raw = memory[-1].dimshuffle(('x', 0))

    net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
    if self.dropout > 0 and self.mode == 'train':
        net = layers.DropoutLayer(net, p=self.dropout)
    last_mem = layers.get_output(net)[0]
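    # Hedged sketch (assumption -- new_episode and the attention bookkeeping
    # live elsewhere in this class): the 4 * dim input size of W_1 above
    # matches the DMN+ attention feature vector built from a fact c, the
    # question q, and the previous memory m:
    #
    #   z = T.concatenate([c * q, c * m, T.abs_(c - q), T.abs_(c - m)])
    #   g = T.dot(self.W_2[hop], T.tanh(T.dot(self.W_1[hop], z) + self.b_1[hop])) + self.b_2[hop]
    #
    # with the scores g softmax-normalised over facts to give the episode's
    # attention weights. The commented-out 7 * dim variant corresponds to the
    # larger feature set of the original DMN paper.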
    print "==> building answer module"
    self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))

    if self.answer_module == 'feedforward':
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

    elif self.answer_module == 'recurrent':
        self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        def answer_step(prev_a, prev_y):
            a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
            y = nn_utils.softmax(T.dot(self.W_a, a))
            return [a, y]

        # add conditional ending?
        dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
        results, updates = theano.scan(fn=answer_step,
                                       outputs_info=[last_mem, T.zeros_like(dummy)],
                                       n_steps=1)
        self.prediction = results[1][-1]

    else:
        raise Exception("invalid answer_module")

    print "==> collecting all parameters"
    self.params = [self.W_pe,
                   self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd,
                   self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd,
                   self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd,
                   self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd,
                   self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd,
                   self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd,
                   self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
                   self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                   self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
                   #self.W_b,
                   self.W_1, self.W_2, self.b_1, self.b_2, self.W_a]

    if self.answer_module == 'recurrent':
        self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                                     self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                                     self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid]

    print "==> building loss layer and computing updates"
    self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x', 0),
                                                   T.stack([self.answer_var]))[0]
    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0

    self.loss = self.loss_ce + self.loss_l2

    #updates = lasagne.updates.adadelta(self.loss, self.params)
    #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)
    updates = lasagne.updates.adam(self.loss, self.params, learning_rate=0.0001, beta1=0.5)  # from DCGAN paper

    self.attentions = T.stack(self.attentions)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var],
                                        outputs=[self.prediction, self.loss, self.attentions],
                                        updates=updates,
                                        on_unused_input='warn',
                                        allow_input_downcast=True)

    print "==> compiling test_fn"
    self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var],
                                   outputs=[self.prediction, self.loss, self.attentions],
                                   on_unused_input='warn',
                                   allow_input_downcast=True)
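    # Hedged usage sketch (assumption -- example variable names only): once
    # constructed, the compiled functions are called once per
    # (story, question, answer) example, e.g.
    #
    #   pred, loss, att = model.train_fn(story_word_ids,     # int matrix, one row per sentence
    #                                    question_word_ids,  # int vector
    #                                    answer_word_id,     # int scalar
    #                                    input_mask)         # int vector
    #
    # test_fn takes the same arguments but applies no parameter updates.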