示例#1
0
    def __init__(self, input, We):
        """
        Embedding lookup followed by a three-word context window.

        Input = a list (minibatch) of lists of indexes (pre-processed sentences)
        We = a word embedding matrix (vocabulary * dimensions)
        """
        # The embedding matrix is stored as a shared variable and is the
        # only parameter this layer exposes.
        embedding_values = numpy.asarray(We, dtype=theano.config.floatX)
        self.We = theano.shared(embedding_values, name='We', borrow=True)

        # Index lookup:
        # (batch_size * indices) -> (batch_size * indices * dimensions)
        centre = self.We[input]

        # Neighbouring words are obtained by rolling along the indices axis.
        # The slot that wrapped around is overwritten with a 0-valued vector,
        # which stands for the sentence boundary.
        rolled_forwards = T.roll(centre, 1, axis=1)
        previous_word = T.set_subtensor(rolled_forwards[:, 0], 0.)
        rolled_backwards = T.roll(centre, -1, axis=1)
        next_word = T.set_subtensor(rolled_backwards[:, -1], 0.)

        # I/O: the window is the concatenation along the dimensions axis.
        self.input = input
        self.output = T.concatenate([previous_word, centre, next_word],
                                    axis=2)
        # parameters of the model
        self.params = [self.We]
示例#2
0
    def get_output_for(self, input, **kwargs):
        """Return the negated (gold path score - normaliser) of a linear-chain CRF.

        input: batch * time * class unary scores.
        The gold labels and the mask are read from self.label_input and
        self.mask_input (both batch * time); transition scores come from
        self.W_sim (class * class).

        The result is summed over the batch and divided by the batch size
        when self.normalize is set.
        """
        def norm_fn(f, mask, label, previous, W_sim):
            # f: batch * class, mask: batch, label: batch (unused here),
            # previous: batch * class, W_sim: class * class
            scores = previous.dimshuffle(0, 1, 'x') + f.dimshuffle(0, 'x', 1) + W_sim.dimshuffle('x', 0, 1) # batch * class * class
            scores = theano_logsumexp(scores, axis = 1) # batch * class
            mask = mask.dimshuffle(0, 'x')
            # Masked-out steps carry the previous value forward unchanged.
            return previous * (1.0 - mask) + scores * mask

        f = input # batch * time * class
        if self.end_points:
            # Extra per-class scores for the first and the last time step.
            for i in range(self.num_classes):
                f = T.inc_subtensor(f[:, 0, i], self.W_end_points[0, i])
                f = T.inc_subtensor(f[:, -1, i], self.W_end_points[1, i])

        initial = f[:, 0, :]
        outputs, _ = theano.scan(fn = norm_fn, \
         sequences = [f.dimshuffle(1, 0, 2)[1: ], self.mask_input.dimshuffle(1, 0)[1: ], self.label_input.dimshuffle(1, 0)[1:]], \
         outputs_info = initial, non_sequences = [self.W_sim], strict = True)
        norm = T.sum(theano_logsumexp(outputs[-1], axis = 1))

        # Unary score of the gold labels, masked.
        f_pot = (f.reshape((-1, f.shape[-1]))[T.arange(f.shape[0] * f.shape[1]), self.label_input.flatten()] * self.mask_input.flatten()).sum()

        labels = self.label_input # batch * time
        mask = self.mask_input # batch * time

        # Transition score of the gold labels: pair step t with step t + 1.
        # Slicing is used instead of T.roll because rolling wraps the last
        # step around to the first one, which added a spurious
        # last -> first transition for every sequence that fills the whole
        # time axis; the normaliser above never contains that transition.
        current_labels = labels[:, :-1].flatten()
        following_labels = labels[:, 1:].flatten()
        pair_mask = mask[:, :-1].flatten() * mask[:, 1:].flatten()
        g_pot = (self.W_sim[current_labels, following_labels] * pair_mask).sum()

        loss = - (f_pot + g_pot - norm)
        return loss / f.shape[0] if self.normalize else loss
def WeeklyRandomWalk(name,n,initial,flt=np.array([.05,.1,.7,.1,.05],dtype=np.float64),sigma=.05,offset=0):
    """Add a smoothed, weekly-stepped Gaussian random walk to `initial`.

    One random-walk value is drawn per week, repeated over 7 days, shifted
    by `offset`, smoothed with the normalised kernel `flt` and cut to `n`
    entries.  Two PyMC variables are created: name+"_sigma_random_walk" and
    name+"_random_walk".
    """
    # No extra weekly step is needed when offset is a multiple of 7.
    extra_week = np.array([0,1,1,1,1,1,1])[offset%7]
    offset = tt.cast(offset,"int64")
    n_weeks = n//7+extra_week

    step_sigma = pm.HalfNormal(name=name+"_sigma_random_walk", sigma=sigma)
    weekly = pm.distributions.timeseries.GaussianRandomWalk(
        name=name+"_random_walk",
        mu=0,
        sigma=step_sigma,
        shape=n_weeks,
        init=pm.Normal.dist(sigma=sigma),
    )

    # Normalise the smoothing kernel so that it sums to one.
    kernel = tt.cast(flt,np.float64)
    kernel = kernel / tt.sum(kernel)

    # Pad the weekly values with two copies of the first value in front and
    # two copies of the last value at the back.
    padded = tt.cast(tt.alloc(0.,weekly.shape[0]+4),"float64")
    padded = tt.set_subtensor(padded[2:-2],weekly)
    padded = tt.set_subtensor(padded[:2],weekly[0])
    padded = tt.set_subtensor(padded[-2:],weekly[-1])

    # Repeat every weekly value seven times, then shift by the offset.
    daily = tt.flatten(tt.alloc(padded,7,padded.shape[0]).T,ndim=1)
    daily = tt.roll(daily,offset)

    # conv2d is fed single-row matrices for both signal and kernel.
    signal_row = tt.alloc(daily,1,daily.shape[0])
    kernel_row = tt.alloc(kernel,1,kernel.shape[0])
    smoothed = tt.signal.conv.conv2d(signal_row,kernel_row,border_mode='full')
    smoothed = tt.roll(smoothed[0],offset)

    # Skip the two padding weeks and half the kernel width, keep n days.
    first = 14+kernel.shape[0]//2
    window = smoothed[first:(7*weekly.shape[0]+first)][:n]

    return initial+window
示例#4
0
def ShiftConv(w_t_g, s_t, N):
    """Circularly shift `w_t_g` by a fractional amount with linear interpolation.

    The shift is 2*s_t - 1, wrapped into [0, N) with a modulo.  The result
    blends the rolls by floor(shift) and floor(shift) + 1, weighted by how
    close the wrapped shift is to each of them.
    """
    wrapped = T.mod(2.*s_t-1.+N, N)
    whole = T.iround(T.floor(wrapped))
    # Weight of the lower integer shift: one minus the fractional part.
    lower_weight = 1 - (wrapped - T.floor(wrapped))
    lower = T.roll(w_t_g, -whole)
    upper = T.roll(w_t_g, -(whole+1))
    return lower*lower_weight + upper*(1-lower_weight)
def TransferWeekendReported(r_t,f,mask):
    """Delay a fraction `f` of the weekend values in `r_t`.

    The share selected by mask[5] is moved two entries later and the share
    selected by mask[6] one entry later (Saturday and Sunday according to
    the original documentation).  Values rolled past the end are dropped
    instead of wrapping around to the start.
    """
    moved_sat = r_t * mask[5] * f
    moved_sun = r_t * mask[6] * f
    remaining = r_t - moved_sat - moved_sun

    # Shift the moved shares and blank the entries that wrapped around.
    delayed_sat = tt.set_subtensor(tt.roll(moved_sat,2)[:2],0)
    delayed_sun = tt.set_subtensor(tt.roll(moved_sun,1)[:1],0)

    return remaining + delayed_sat + delayed_sun
示例#6
0
文件: generate.py 项目: gray0302/lrn2
    def manhatten_corr(self, a, b):
        """Sum the L1 distances between `b` and every circular shift of `a`.

        All combinations of shifts along axis 2 and axis 3 of `a` are
        visited once.
        """
        n_rows, n_cols = a.shape[2], a.shape[3]
        # Axis-2 shifts, e.g. [0,0,0,1,1,1,2,2,2] for a 3 x 3 grid
        row_shifts = T.arange(n_rows).repeat(n_cols)
        # Axis-3 shifts, e.g. [0,1,2,0,1,2,0,1,2] for a 3 x 3 grid
        col_shifts = T.tile(T.arange(n_cols), (n_rows, ))

        def shifted_distance(row_shift, col_shift):
            shifted = T.roll(T.roll(a, shift=col_shift, axis=3),
                             shift=row_shift, axis=2)
            return T.sum(T.abs_(shifted - b))

        distances, _ = theano.scan(shifted_distance,
                                   sequences=[row_shifts, col_shifts])

        return T.sum(distances)
    def get_output_for(self, input, **kwargs):
        """Return the batch-averaged negated (gold path score - normaliser) of a linear-chain CRF.

        input is projected with self.W to unary scores f (inst * time * class).
        The gold labels and the mask are read from self.label_input and
        self.mask_input (both inst * time); transition scores come from
        self.W_sim (class * class).  The module-level switches CRF_INIT and
        COST add initial-label scores and a COST_CONST penalty on non-gold
        labels inside the normaliser.
        """
        def norm_fn(f, mask, label, previous, W_sim):
            # f: inst * class, mask: inst, label: inst,
            # previous: inst * class, W_sim: class * class
            scores = previous.dimshuffle(0, 1, 'x') + f.dimshuffle(
                0, 'x', 1) + W_sim.dimshuffle('x', 0, 1)
            if COST:
                scores = scores + COST_CONST * (1.0 - T.extra_ops.to_one_hot(
                    label, self.num_classes).dimshuffle(0, 'x', 1))
            # scores: inst * prev * cur
            scores = theano_logsumexp(scores, axis=1)
            # scores: inst * class
            mask = mask.dimshuffle(0, 'x')
            # Masked-out steps carry the previous value forward unchanged.
            return previous * (1.0 - mask) + scores * mask

        f = T.dot(input, self.W)
        # f: inst * time * class

        initial = f[:, 0, :]
        if CRF_INIT:
            initial = initial + self.W_init[0].dimshuffle('x', 0)
        if COST:
            initial = initial + COST_CONST * (1.0 - T.extra_ops.to_one_hot(
                self.label_input[:, 0], self.num_classes))
        outputs, _ = theano.scan(fn=norm_fn, \
                                 sequences=[f.dimshuffle(1, 0, 2)[1:], self.mask_input.dimshuffle(1, 0)[1:],
                                            self.label_input.dimshuffle(1, 0)[1:]], \
                                 outputs_info=initial, non_sequences=[self.W_sim], strict=True)
        norm = T.sum(theano_logsumexp(outputs[-1], axis=1))

        # Unary score of the gold labels, masked.
        f_pot = (f.reshape(
            (-1, f.shape[-1]))[T.arange(f.shape[0] * f.shape[1]),
                               self.label_input.flatten()] *
                 self.mask_input.flatten()).sum()
        if CRF_INIT:
            f_pot += self.W_init[0][self.label_input[:, 0]].sum()

        labels = self.label_input
        # labels: inst * time
        mask = self.mask_input
        # mask : inst * time

        # Transition score of the gold labels: pair step t with step t + 1.
        # Slicing is used instead of T.roll because rolling wraps the last
        # step around to the first one, which added a spurious
        # last -> first transition for every sequence that fills the whole
        # time axis; the normaliser above never contains that transition.
        current_labels = labels[:, :-1].flatten()
        following_labels = labels[:, 1:].flatten()
        pair_mask = mask[:, :-1].flatten() * mask[:, 1:].flatten()
        g_pot = (self.W_sim[current_labels, following_labels] *
                 pair_mask).sum()

        return -(f_pot + g_pot - norm) / f.shape[0]
    def infer(self, keys, key_mask, values, initial_state, target_embedding,
              target_bias, keep_prob):
        """Run self._infer_step in a scan loop and collect its outputs.

        Returns the stacked states, contexts, the probabilities flattened to
        (step * batch, n_voc) and the step mask.  The mask is delayed by one
        step, with the all-ones initial mask as its first row.
        """
        def step(y_prev, mask, state, keys, values, key_mask, embedding,
                 embedding_bias):
            # keep_prob comes from the enclosing call; scan does not pass it.
            return self._infer_step(y_prev, mask, state, keys, values,
                                    key_mask, embedding, embedding_bias,
                                    keep_prob)

        n_steps, batch_size = key_mask.shape
        first_inputs = T.zeros((batch_size, target_embedding.shape[1]),
                               "float32")
        first_mask = T.ones((batch_size, 1), "float32")

        # The loop runs for twice the number of rows of key_mask.
        _, mask, states, contexts, probs = ops.scan(
            step,
            None,
            [first_inputs, first_mask, initial_state, None, None],
            [keys, values, key_mask, target_embedding, target_bias],
            n_steps=n_steps * 2)

        # Drop the trailing axis, then delay the mask by one step.
        mask = T.roll(T.reshape(mask, mask.shape[:-1]), 1, 0)
        mask = T.set_subtensor(mask[0, :], first_mask[:, 0])

        # (step, batch, n_voc)->(step*batch, n_voc)
        flat_probs = T.reshape(
            probs, (probs.shape[0] * probs.shape[1], probs.shape[2]))
        return states, contexts, flat_probs, mask
示例#9
0
def window_batch_timewise(t, b, w, full_index):
    """Roll and offset the first `w` rows of `full_index`.

    Row i is rolled by i positions.  For every row after the first, the
    entries that are greater than zero are then increased by i * t * b - i.
    """
    for row in range(w):
        rolled = T.roll(full_index[row], row)
        full_index = T.set_subtensor(full_index[row], rolled)
        if row == 0:
            # The first row receives no offset.
            continue
        bump = T.where(full_index[row] > 0, row * t * b - row, 0)
        full_index = T.inc_subtensor(full_index[row], bump)
    return full_index
示例#10
0
def hybo_channel(x, p, shift, seed=None, unif=True, just_dropout=False):
    '''Theano hybrid bootstrap backend.

    x is indexed as a 4-axis tensor; one mask value is drawn per
    (axis-0 entry, axis-3 entry) pair and repeated over axes 1 and 2.

    p            -- shared variable holding the maximum drop probability;
                    its value must lie in [0, 1].
    shift        -- roll along axis 0 used to pick the replacement values.
    seed         -- seed of the random stream; drawn at random when None.
    unif         -- if true, the drop probability of every axis-0 entry is
                    sampled uniformly from [0, p]; otherwise p itself is used.
    just_dropout -- if true, apply rescaled dropout instead of replacing the
                    dropped values with those of the rolled tensor.

    Raises ValueError when p is outside [0, 1].
    '''
    p_value = p.get_value()
    if p_value < 0. or p_value > 1:
        raise ValueError('Hybrid bootstrap p must be in interval [0, 1].')

    if seed is None:
        seed = np.random.randint(1, 10e6)
    # The stream has to be created for caller-supplied seeds as well;
    # it used to be built only when no seed was given, which left `rng`
    # undefined otherwise.
    rng = K.RandomStreams(seed=seed)

    if unif:
        retain_prob = 1. - rng.uniform((x.shape[0], ), 0, p, dtype=x.dtype)
        # Append broadcastable axes until retain_prob has x.ndim dimensions.
        for dim in range(x.ndim - 1):
            retain_prob = K.expand_dims(retain_prob, dim + 1)
    else:
        retain_prob = 1. - p

    mask = rng.binomial((x.shape[0], 1, 1, x.shape[3]),
                        p=retain_prob,
                        dtype=x.dtype)
    mask = T.extra_ops.repeat(mask, x.shape[1], axis=1)
    mask = T.extra_ops.repeat(mask, x.shape[2], axis=2)

    if just_dropout:
        x = x * mask / retain_prob
    else:
        # Dropped positions take the value found at the same position of
        # the tensor rolled along axis 0.
        x = x * mask + (1 - mask) * T.roll(x, shift=shift, axis=0)
    return x
示例#11
0
 def getScoreOfPath(self, s, path):
     """Sum self.computeScore over the steps of `path`.

     computeScore is scanned over `s`, `path` and the path delayed by one
     step; the first step gets -1 as its predecessor.
     """
     delayed = T.roll(path,1)
     # Replace the wrapped-around last element with the -1 marker.
     predecessors = T.set_subtensor(delayed[0], -1)
     step_scores, _ = theano.scan(fn = self.computeScore,
                                  sequences = [s, path, predecessors],
                                  n_steps = path.shape[0])
     return T.sum(step_scores)
def WeeklyRandomWalkWeekend(name,n,initial,wfactor,flt=np.array([.05,.1,.7,.1,.05],dtype=np.float64),sigma=.05,offset=0):
    """Smoothed weekly random walk added to `initial`, scaled by `wfactor` on weekends.

    One random-walk value is drawn per week, repeated over 7 days, smoothed
    with the normalised kernel `flt`, shifted by `offset` and cut to `n` days.
    Two PyMC variables are created: name+"_sigma_random_walk" and
    name+"_random_walk".

    Returns (daily_walk, week_mask), where week_mask is the 7 x n
    day-of-week indicator matrix built below.
    """
    additional_week = np.array([0,1,1,1,1,1,1])[offset%7]   # no additional week is needed when offset % 7 == 0 (first day is a Monday)
    offset = tt.cast(offset,"int64")
    walk_len = n//7+additional_week
    rw_list = []
    rw_list.append(initial)
    # Step size of the random walk
    sigma_random_walk = pm.HalfNormal(name=name+"_sigma_random_walk", sigma=sigma)
    random_walk = pm.distributions.timeseries.GaussianRandomWalk(
                              name=name+"_random_walk",mu=0,
                              sigma=sigma_random_walk,shape=walk_len,
                              init=pm.Normal.dist(sigma=sigma),
                        )
    flt = flt / tt.sum(flt)
    val = random_walk
    # Build a longer list: two extra entries at the front and two at the back,
    # holding the same value as the original first / last entry.
    # --> 2 weeks before / after, to allow simple filtering and an offset of up to one week each.
    lval = tt.alloc(0.,val.shape[0]+4)  # stretched list of values
    lval = tt.cast(lval,"float64")
    lval = tt.set_subtensor(lval[2:-2],val)
    lval = tt.set_subtensor(lval[:2],val[0])
    lval = tt.set_subtensor(lval[-2:],val[-1]) # extend with the last value

    # Generate a matrix of shape 7 x (#weeks), i.e. the weekly values duplicated over 7 rows
    m = tt.alloc(lval,7,lval.shape[0])
    mf = tt.flatten(m.T,ndim=1) # Flatten it: every weekly value now appears 7 times in a row

    # conv2d is fed single-row matrices for both signal and kernel
    mf2 = tt.alloc(mf,1,mf.shape[0])
    kern2 = tt.alloc(flt,1,flt.shape[0])

    daily_values = tt.signal.conv.conv2d(mf2,kern2,border_mode='full')
    daily_values = tt.roll(daily_values[0],-offset)
    # Skip the two padding weeks and half the kernel width, then keep n days
    daily_values_ranged = daily_values[(14+flt.shape[0]//2):(7*val.shape[0]+14+flt.shape[0]//2)][:n]

    # Generate a 7 x (n days) matrix marking the day of the week
    d_oeye = tt.roll(tt.eye(7),-offset,axis=1)
    week_mask = tt.tile(d_oeye,walk_len)[:,:n]

    daily_walk = rw_list[0]+daily_values_ranged

    # Create a factor that is wfactor at the weekends and 1 otherwise, then multiply it with daily_walk
    weekend_m = week_mask[5] + week_mask[6]   # Saturday + Sunday
    weekend_f = weekend_m*wfactor - weekend_m + tt.ones_like(weekend_m)
    daily_walk = daily_walk * weekend_f

    return daily_walk,week_mask
    def evaluate(self, application_call, outputs, mask=None, **kwargs):
        """Compute generation costs together with the states and glimpses used.

        Parameters
        ----------
        application_call
            Receives the states, the glimpses and their final values as
            auxiliary variables.
        outputs
            Output sequences; assumed to have axes (time, batch, features, ...).
        mask
            Optional mask; handed to the transition and to the language
            model, and multiplied into the costs.
        **kwargs
            Initial states and contexts, picked out by name.

        Returns
        -------
        list
            ``[costs]`` followed by the state values and then the glimpse
            values.
        """
        # We assume the data has axes (time, batch, features, ...)
        batch_size = outputs.shape[1]

        # Prepare input for the iterative part
        states = dict_subset(kwargs, self._state_names, must_have=False)
        # masks in context are optional (e.g. `attended_mask`)
        contexts = dict_subset(kwargs, self._context_names, must_have=False)
        feedback = self.readout.feedback(outputs)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        results = self.transition.apply(
            mask=mask, return_initial_states=True, as_dict=True,
            **dict_union(inputs, states, contexts))

        # Separate the deliverables. The last states are discarded: they
        # are not used to predict any output symbol. The initial glimpses
        # are discarded because they are not used for prediction.
        # Remember, glimpses are computed _before_ output stage, states are
        # computed after.
        states = OrderedDict((name, results[name][:-1]) for name in self._state_names)
        glimpses = OrderedDict((name, results[name][1:]) for name in self._glimpse_names)

        # Compute the cost: the feedback is delayed by one step and the
        # first step is fed the feedback of the initial outputs.
        feedback = tensor.roll(feedback, 1, 0)
        feedback = tensor.set_subtensor(
            feedback[0],
            self.readout.feedback(self.readout.initial_outputs(batch_size)))

        # Run the language model
        if self.language_model:
            lm_states = self.language_model.evaluate(
                outputs=outputs, mask=mask, as_dict=True)
            lm_states = {'lm_' + name: value for name, value
                         in lm_states.items()}
        else:
            lm_states = {}

        readouts = self.readout.readout(
            feedback=feedback,
            **dict_union(lm_states, states, glimpses, contexts))
        costs = self.readout.cost(readouts, outputs)
        if mask is not None:
            costs *= mask

        for name, variable in list(glimpses.items()) + list(states.items()):
            application_call.add_auxiliary_variable(
                variable.copy(), name=name)

        # This variables can be used to initialize the initial states of the
        # next batch using the last states of the current batch.
        for name in self._state_names + self._glimpse_names:
            application_call.add_auxiliary_variable(
                results[name][-1].copy(), name=name+"_final_value")

        # values() is a view on Python 3 and cannot be added to a list,
        # so both are materialised first.
        return [costs] + list(states.values()) + list(glimpses.values())
示例#14
0
 def mask_for_prediction(self, prediction):
     """Build a floatX mask that is 1 up to and including the first EOS along axis 0.

     Entries after the first occurrence of self.eos_label are 0.
     """
     float_type = theano.config.floatX
     is_eos = tensor.eq(prediction, self.eos_label).astype(float_type)
     # Running count of EOS labels seen so far, the current step included.
     eos_count = tensor.cumsum(is_eos, axis=0)
     before_eos = tensor.lt(eos_count, 1).astype(float_type)
     # Delay by one step so that the EOS step itself stays unmasked; the
     # row that wrapped around is replaced by ones.
     delayed = tensor.roll(before_eos, 1, 0)
     first_row = delayed[0, :]
     return tensor.set_subtensor(first_row, tensor.ones_like(first_row))
示例#15
0
    def cost(self, application_call, outputs, mask=None, **kwargs):
        """Compute the generation costs of the given output sequences.

        Parameters
        ----------
        outputs : :class:`~tensor.TensorVariable`
            Output sequences as a 3(2)-dimensional tensor. Dimension 0
            stands for time, dimension 1 for the position in the batch.
        mask : :class:`~tensor.TensorVariable`
            The binary matrix identifying fake outputs.

        Notes
        -----
        The contexts are expected as keyword arguments.

        """
        batch_size = outputs.shape[-2]  # TODO Assumes only 1 features dim

        # Gather what the transition needs: optional initial states,
        # mandatory contexts and the (optionally forked) feedback.
        states = dict((name, kwargs[name])
                      for name in self.state_names if name in kwargs)
        contexts = dict((name, kwargs[name]) for name in self.context_names)
        feedback = self.readout.feedback(outputs)
        if self.fork:
            inputs = self.fork.apply(feedback, return_dict=True)
        else:
            inputs = {'feedback': feedback}

        # Run the recurrent network
        transition_inputs = dict_union(inputs, states, contexts)
        results = self.transition.apply(mask=mask,
                                        return_initial_states=True,
                                        return_dict=True,
                                        **transition_inputs)

        # Split the results; the last state of every sequence is dropped.
        states = dict((name, results[name][:-1])
                      for name in self.state_names)
        glimpses = dict((name, results[name])
                        for name in self.glimpse_names)

        # Delay the feedback by one step; the first step is fed the
        # feedback of the initial outputs.
        first_feedback = self.readout.feedback(
            self.readout.initial_outputs(batch_size, **contexts))
        delayed_feedback = tensor.set_subtensor(
            tensor.roll(feedback, 1, 0)[0], first_feedback)

        readout_inputs = dict_union(states, glimpses, contexts)
        readouts = self.readout.readout(feedback=delayed_feedback,
                                        **readout_inputs)
        costs = self.readout.cost(readouts, outputs)

        # Expose the glimpses in case the user needs them.
        for name, variable in glimpses.items():
            application_call.add_auxiliary_variable(variable.copy(), name=name)

        return costs
示例#16
0
    def cost_matrix(self, application_call, outputs, mask=None, **kwargs):
        """Compute generation costs for output sequences.

        The keyword argument `initial_state_context` is mandatory and is
        passed along with the other contexts.

        See Also
        --------
        :meth:`cost` : Scalar cost.

        """
        # The data is assumed to have axes (time, batch, features, ...)
        batch_size = outputs.shape[1]

        # Inputs of the iterative part. Masks among the contexts are
        # optional (e.g. `attended_mask`).
        states = dict_subset(kwargs, self._state_names, must_have=False)
        contexts = dict_subset(kwargs, self._context_names, must_have=False)
        contexts['initial_state_context'] = kwargs['initial_state_context']

        feedback = self.readout.feedback(outputs)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        transition_inputs = dict_union(inputs, states, contexts)
        results = self.transition.apply(mask=mask,
                                        return_initial_states=True,
                                        as_dict=True,
                                        **transition_inputs)

        # The last states are dropped because they predict no output
        # symbol; the initial glimpses are dropped because they are not
        # used for prediction (glimpses are computed _before_ the output
        # stage, states after it).
        states = dict((name, results[name][:-1])
                      for name in self._state_names)
        glimpses = dict((name, results[name][1:])
                        for name in self._glimpse_names)

        # Delay the feedback by one step; the first step is fed the
        # feedback of the initial outputs.
        first_feedback = self.readout.feedback(
            self.readout.initial_outputs(batch_size))
        delayed_feedback = tensor.set_subtensor(
            tensor.roll(feedback, 1, 0)[0], first_feedback)

        readout_inputs = dict_union(states, glimpses, contexts)
        readouts = self.readout.readout(feedback=delayed_feedback,
                                        **readout_inputs)
        costs = self.readout.cost(readouts, outputs)
        if mask is not None:
            costs *= mask

        # Glimpses first, then states, as auxiliary variables.
        for group in (glimpses, states):
            for name, variable in group.items():
                application_call.add_auxiliary_variable(variable.copy(),
                                                        name=name)

        # These variables can be used to initialize the initial states of
        # the next batch from the last states of the current batch.
        for name in self._state_names + self._glimpse_names:
            final_value = results[name][-1].copy()
            application_call.add_auxiliary_variable(
                final_value, name=name + "_final_value")

        return costs
 def new_day(lambda_at_t,imported_at_t,infected,E_t,beta,N):
     """Advance the model by one step.

     Returns the new cases (clipped to [0, N]), the `infected` history
     rolled by one along axis 0 with the new cases stored at the front, and
     E_t reduced by the new cases (clipped to [0, E_t]).
     """
     fraction = E_t / N
     produced = imported_at_t + theano.dot(infected,beta) * lambda_at_t * fraction
     new = tt.clip(produced,0,N)

     # Make room at the front of the history and store the new cases there.
     history = tt.roll(infected,1,0)
     history = tt.set_subtensor(history[:1],new,inplace=False)

     remaining = tt.clip(E_t-new,0,E_t)
     return new,history,remaining
示例#18
0
文件: utils.py 项目: harpone/DerpRNN
def roll_and_dot(wvec, xvec):
    """
    Dot `xvec` with `wvec`, then roll `wvec` by one position.

    wvec.shape = (n_in, )
    xvec.shape = (timesteps, n_in)

    Returns the rolled wvec, the dot product and xvec unchanged.
    """
    projection = T.dot(xvec, wvec)
    rolled = T.roll(wvec, 1)
    return rolled, projection, xvec
def GenInit(l,a1,a2,t1=10,t2=27,offset=8):
    """Build an initial curve of length `l` from two scaled log-normal bumps.

    The bumps are tt_lognormal(x, log(t1), .8) * 2350 and
    tt_lognormal(x, log(t2), .25) * 12500, weighted by `a1` and `a2`.
    The sum is shifted towards the start by `offset` entries and the
    entries that wrapped around to the end are set to zero.

    `offset` is expected to be non-negative.
    """
    x = tt.arange(l)
    d1 = tt_lognormal(x,tt.log(t1),.8)*2350 #.4 / 23500
    d2 = tt_lognormal(x,tt.log(t2),.25)*12500

    din = d1*a1 + d2*a2
    din = tt.roll(din,-offset)
    # Zero the last `offset` entries.  The start index is computed from the
    # length because din[-offset:] selects the WHOLE vector when offset is
    # 0, which wiped the entire curve; the maximum keeps the old behaviour
    # (everything zeroed) for offsets larger than the vector.
    tail_start = tt.maximum(din.shape[0] - offset, 0)
    din = tt.set_subtensor(din[tail_start:],0.)
    return din
示例#20
0
def roll_and_dot(wvec, xvec):
    """
    Project `xvec` onto `wvec` and return `wvec` rolled by one position.

    wvec.shape = (n_in, )
    xvec.shape = (timesteps, n_in)

    The result is the tuple (rolled wvec, dot product, xvec).
    """
    product = T.dot(xvec, wvec)
    shifted_weights = T.roll(wvec, 1)
    return shifted_weights, product, xvec
示例#21
0
    def mse2consist_err(self, y):
        """Return 0.9 * mean squared error + 0.1 * consistency error.

        The mean squared error compares `y` with self.y_t.  The consistency
        error is the mean squared difference between self.y_t and itself
        rolled by one along axis 0.
        """
        # Function-call form: behaves the same on Python 2 and is valid
        # syntax on Python 3, unlike the print statement.
        print('=== using mse2consist error. ===')
        # mean square error
        mse = T.mean(T.pow(y - self.y_t, 2))
        # consistency error
        cst_err = T.mean(T.pow(self.y_t - T.roll(self.y_t, shift=1, axis=0),
                               2))

        hybrid_err = 0.9 * mse + 0.1 * cst_err
        return hybrid_err
示例#22
0
def ShiftConv(w_t_g, s_t, N, num_shifts):
    """Circularly shift `w_t_g` by a fractional amount with linear interpolation.

    The shift is 2*s_t - 1, wrapped into [0, N) with a modulo.  The result
    blends the rolls by floor(shift) and floor(shift) + 1, weighted by how
    close the wrapped shift is to each of them.

    `num_shifts` is not used by this implementation.
    """
    wrapped = T.mod(2.*s_t-1.+N, N)
    whole = T.iround(T.floor(wrapped))
    # Weight of the lower integer shift: one minus the fractional part.
    lower_weight = 1 - (wrapped - T.floor(wrapped))
    lower = T.roll(w_t_g, -whole)
    upper = T.roll(w_t_g, -(whole+1))
    return lower*lower_weight + upper*(1-lower_weight)
def roll(x, shift, axis):
    """
    Roll `x` with whichever backend matches its type.

    numpy arrays are handled by numpy.roll, Theano tensor variables by
    theano.tensor.roll; any other type raises NotImplementedError.

    See numpy.roll for usage
    """
    if isinstance(x, np.ndarray):
        backend = np
    elif isinstance(x, T.basic.TensorVariable):
        backend = T
    else:
        raise NotImplementedError()
    return backend.roll(x, shift, axis)
 def _shift_step(c_mem, c_shift):
     """Shift one memory block along its first axis.

     c_mem is (note, mem); c_shift is an int.  In "roll" mode the rows
     wrap around; in "drop" mode rows shifted out are discarded and
     zero rows are inserted on the other side.
     """
     if self.mode=="roll":
         return T.roll(c_mem, (-c_shift)%12, axis=0)
     if self.mode=="drop":
         window = self.window_size
         # Rows cut from the head / tail, each clamped to [0, window].
         cut_head = T.maximum(0,T.minimum(c_shift,window))
         cut_tail = T.maximum(0,T.minimum(-c_shift,window))
         kept = c_mem[cut_head:window-cut_tail,:]
         # Whatever was cut on one side is padded with zeros on the other.
         front_padding = T.zeros((cut_tail,per_note))
         back_padding = T.zeros((cut_head,per_note))
         return T.concatenate([front_padding, kept, back_padding], 0)
    def evolve_system(self, x, n, k, gamma):
        """ Time-derivative of the system at state `x`

        Model: dx/dt = k^n / (x^n + k^n) - gamma*x
        This leads to 3+ species sustained oscillations. Note that x is matrix.

        Each variable depends only on its neighbour along axis 1, so the
        production term is simply rolled by `shift=-1` along that axis.
        """
        k_to_n = T.pow(k, n)
        production = k_to_n/(T.pow(x, n)+k_to_n)
        return T.roll(production, shift = -1, axis = 1) - gamma*x
示例#26
0
    def cost_matrix(self, application_call, outputs, mask=None, **kwargs):
        """Compute generation costs for output sequences.

        See Also
        --------
        :meth:`cost` : Scalar cost.

        """
        # The data is assumed to have axes (time, batch, features, ...)
        batch_size = outputs.shape[1]

        # Inputs of the iterative part. Masks among the contexts are
        # optional (e.g. `attended_mask`).
        states = dict_subset(kwargs, self._state_names, must_have=False)
        contexts = dict_subset(kwargs, self._context_names, must_have=False)
        feedback = self.readout.feedback(outputs)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        transition_inputs = dict_union(inputs, states, contexts)
        results = self.transition.apply(
            mask=mask, return_initial_states=True, as_dict=True,
            **transition_inputs)

        # The last states are dropped because they predict no output
        # symbol; the initial glimpses are dropped because they are not
        # used for prediction (glimpses are computed _before_ the output
        # stage, states after it).
        states = dict((name, results[name][:-1])
                      for name in self._state_names)
        glimpses = dict((name, results[name][1:])
                        for name in self._glimpse_names)

        # Delay the feedback by one step; the first step is fed the
        # feedback of the initial outputs.
        first_feedback = self.readout.feedback(
            self.readout.initial_outputs(batch_size))
        delayed_feedback = tensor.set_subtensor(
            tensor.roll(feedback, 1, 0)[0], first_feedback)

        readout_inputs = dict_union(states, glimpses, contexts)
        readouts = self.readout.readout(
            feedback=delayed_feedback, **readout_inputs)
        costs = self.readout.cost(readouts, outputs)
        if mask is not None:
            costs *= mask

        # Glimpses first, then states, as auxiliary variables.
        for group in (glimpses, states):
            for name, variable in group.items():
                application_call.add_auxiliary_variable(
                    variable.copy(), name=name)

        # These variables can be used to initialize the initial states of
        # the next batch from the last states of the current batch.
        for name in self._state_names:
            final_value = results[name][-1].copy()
            application_call.add_auxiliary_variable(
                final_value, name=name+"_final_value")

        return costs
示例#27
0
    def evolve_system(self, x, n, k, gamma):
        """ Evaluate dx/dt for the current state `x`

        Model: dx/dt = k^n / (x^n + k^n) - gamma*x
        This leads to 3+ species sustained oscillations. Note that x is matrix.

        Every variable is driven by a single neighbour along axis 1, which is
        implemented by rolling the production term by `shift=-1` on that axis.
        """
        threshold = T.pow(k, n)
        hill_term = threshold/(T.pow(x, n)+threshold)
        driven = T.roll(hill_term, shift = -1, axis = 1)
        return driven - gamma*x
def roll(x, shift, axis):
    """
    Backend-agnostic roll for numpy arrays and Theano tensor variables.

    Dispatches to numpy.roll or theano.tensor.roll depending on the class
    of `x` and raises NotImplementedError for every other type.

    See numpy.roll for usage
    """
    if isinstance(x, np.ndarray):
        roll_impl = np.roll
    elif isinstance(x, T.basic.TensorVariable):
        roll_impl = T.roll
    else:
        raise NotImplementedError()
    return roll_impl(x, shift, axis)
示例#29
0
    def __init__(self, input, We, features, longest):
        """
        Embedding layer producing windowed sentence vectors and lexical features.

        Input = a list (minibatch) of lists of indexes (pre-processed sentences)
        We = a word embedding matrix (vocabulary * dimensions)
        features = per-example feature matrix: columns 0 and 1 are looked up
            as the two events, columns 2:5 and 5:8 as their participants, and
            the columns from 8 onwards hold two blocks of position values
            (the first one `longest` columns wide)
        """
        # The embedding matrix is stored as a shared variable and is the
        # only parameter this layer exposes.
        embedding_values = numpy.asarray(We, dtype=theano.config.floatX)
        self.We = theano.shared(embedding_values, name='We', borrow=True)

        # Index lookup:
        # (batch_size * indices) -> (batch_size * indices * dimensions)
        centre = self.We[input]

        # Neighbouring words are obtained by rolling along the indices axis.
        # The slot that wrapped around is overwritten with a 0-valued vector,
        # which stands for the sentence boundary.
        rolled_forwards = T.roll(centre, 1, axis=1)
        previous_word = T.set_subtensor(rolled_forwards[:, 0], 0.)
        rolled_backwards = T.roll(centre, -1, axis=1)
        next_word = T.set_subtensor(rolled_backwards[:, -1], 0.)
        window = T.concatenate([previous_word, centre, next_word], axis=2)

        # Lexical features (n_examples * dimensions): the two event
        # embeddings plus the element-wise maximum over each participant
        # group.
        lexical_parts = [
            self.We[features[:, 0]],
            self.We[features[:, 1]],
            T.max(self.We[features[:, 2:5]], axis=1),
            T.max(self.We[features[:, 5:8]], axis=1),
        ]
        lex = T.concatenate(lexical_parts, axis=1)

        # Position values are scaled by 1/100 and appended to the window as
        # two extra entries along the last axis.
        first_positions = features[:, 8:8 + longest, numpy.newaxis] / 100.
        second_positions = features[:, 8 + longest:, numpy.newaxis] / 100.
        senpos = T.concatenate([window, first_positions, second_positions],
                               axis=2)

        # I/O
        self.input = input
        self.output = [senpos, lex]
        # parameters of the model
        self.params = [self.We]
示例#30
0
def lanczos(linear_op, z, m, batch_size):
    """Approximate a matrix square root applied to each row of ``z``.

    Runs ``m`` Lanczos iterations with ``linear_op`` on the row-normalised
    ``z``, builds the ``m x m`` tridiagonal matrix from the collected
    coefficients for every batch entry and returns
    ``s * V . sqrtm(M)[:, 0]`` per entry, where ``s`` is the row norm of ``z``
    and ``V`` holds the Lanczos vectors as columns.

    Parameters
    ----------
    linear_op : callable
        Maps a symbolic matrix to a symbolic matrix of the same layout
        (presumably a symmetric operator applied row-wise -- confirm).
    z : theano matrix
        One vector per row; rows are normalised with ``z.norm(2, axis=1)``.
    m : int
        Number of Lanczos steps (a Python int; the loop is unrolled).
    batch_size : int
        Number of rows of ``z`` to process (a Python int; unrolled as well).

    Returns
    -------
    theano variable
        The per-row results stacked with ``T.stacklists``.
    """
    s = z.norm(2, axis=1)
    v_curr = z / s.dimshuffle(0, 'x')

    alpha = []
    beta = []
    V = [v_curr]
    b = None
    v_prev = None

    # `range` instead of `xrange`: same iteration on Python 2, and it does
    # not raise NameError on Python 3.
    for j in range(m):
        r = linear_op(v_curr)
        if j > 0:
            # Remove the component along the previous Lanczos vector.
            r = r - b.dimshuffle(0, 'x') * v_prev
        a = T.batched_dot(v_curr, r)
        r = r - a.dimshuffle(0, 'x') * v_curr
        b = r.norm(2, axis=1)
        v_prev = v_curr
        v_curr = r / b.dimshuffle(0, 'x')
        alpha.append(a)
        # The vector/coefficient produced by the final step is not needed.
        if j < m - 1:
            V.append(v_curr)
            beta.append(b)

    Az_list = []
    for idx in range(batch_size):
        alpha_diag = T.diag(T.stacklists([a_[idx] for a_ in alpha]))
        # Pad beta with a trailing 0 so it has m entries; rolling the diagonal
        # matrix by one row / one column places it on the sub- and
        # super-diagonal, and the wrapped-around corner entry is that 0.
        beta_diag = T.diag(T.stacklists([b_[idx] for b_ in beta] + [0]))
        M = alpha_diag + T.roll(beta_diag, 1, 0) + T.roll(beta_diag, 1, 1)
        V_matrix = T.stacklists([v_[idx] for v_ in V]).T
        approx_sqrt = s[idx] * V_matrix.dot(theano_sqrtm(M)[:, 0])
        Az_list.append(approx_sqrt)

    return T.stacklists(Az_list)
示例#31
0
    def cost(self, outputs, mask=None, **kwargs):
        """Returns generation costs for output sequences.

        Parameters
        ----------
        outputs : Theano variable
            The 3(2) dimensional tensor containing output sequences.
            The dimension 0 must stand for time, the dimension 1 for the
            position on the batch.
        mask : The 0/1 matrix identifying fake outputs.

        Returns
        -------
        The costs computed by ``self.readout.cost``. When the keyword
        argument ``also_return`` is given and non-empty, a tuple
        ``(costs, others)`` is returned instead, where ``others`` maps each
        requested name to the corresponding result of the transition.

        Notes
        -----
        The contexts are expected as keyword arguments. Every name in
        ``self.context_names`` must be present; names from
        ``self.state_names`` are optional.

        """
        batch_size = outputs.shape[-2]  # TODO Assumes only 1 features dim

        # Collect the inputs of the recurrent transition.
        states = {}
        for name in self.state_names:
            if name in kwargs:
                states[name] = kwargs[name]
        contexts = dict((name, kwargs[name]) for name in self.context_names)
        feedback = self.readout.feedback(outputs)
        if self.fork:
            inputs = self.fork.apply(feedback, return_dict=True)
        else:
            inputs = {'feedback': feedback}

        # Run the recurrent network.
        results = self.transition.apply(
            mask=mask, return_initial_states=True, return_dict=True,
            **dict_union(inputs, states, contexts))

        # Initial states are included in the results, so the last step of
        # every state sequence is dropped; glimpses are taken as they are.
        states = dict((name, results[name][:-1])
                      for name in self.state_names)
        glimpses = dict((name, results[name])
                        for name in self.glimpse_names)

        # The readout at step t sees the feedback of step t - 1: shift the
        # feedback by one along time and fill the first step with the
        # feedback of the initial outputs.
        shifted_feedback = tensor.roll(feedback, 1, 0)
        initial_feedback = self.readout.feedback(
            self.readout.initial_outputs(batch_size, **contexts))
        shifted_feedback = tensor.set_subtensor(shifted_feedback[0],
                                                initial_feedback)
        readouts = self.readout.readout(
            feedback=shifted_feedback,
            **dict_union(states, glimpses, contexts))
        costs = self.readout.cost(readouts, outputs)

        # In case the user needs some glimpses or states or smth else
        also_return = kwargs.get("also_return")
        if not also_return:
            return costs
        others = dict((name, results[name]) for name in also_return)
        return (costs, others)
示例#32
0
    def get_probs(self):
        """Build neighbour-difference expressions and return the energies.

        NOTE(review): despite its name this method returns
        ``self.energy_(self.pps)``. ``probs``, ``actions``, ``add`` and
        ``idx`` are constructed below but are neither returned nor stored,
        so they do not influence the result. This looks unfinished --
        confirm the intent before relying on it.
        """
        t = self.temperatures
        # Difference between each inverse temperature and the next one; the
        # last entry only has the wrapped-around neighbour, so it is zeroed.
        t_term = (1. / t - T.roll(1. / t, shift=-1))
        t_term = T.set_subtensor(t_term[-1], 0)

        # Same neighbour difference for the energies of self.pps.
        e_term = self.energy_(self.pps) - T.roll(self.energy_(self.pps),
                                                 shift=-1)
        e_term = T.set_subtensor(e_term[-1], 0.)
        probs = T.exp(t_term * e_term)

        # 1 where probs exceeds a uniform sample of the same shape, else 0.
        actions = T.cast(T.gt(probs, self.t_rng.uniform((probs.shape))), fx)

        # Prepend a 0 so the first difference below compares against zero.
        add = T.concatenate([[np.cast[fx](0.)], actions])

        # Next-minus-current difference, dropping the wrapped last entry.
        add = T.roll(add, shift=-1) - add
        add = add[:-1]
        # Keep only the positive differences as 0/1 flags; clear the last.
        add = T.switch(T.gt(add, 0), 1., 0.)
        add = T.set_subtensor(add[-1], 0.)
        # Each flag minus the one before it (with wrap-around).
        add = add - T.roll(add, shift=1)
        idx = T.arange(actions.shape[0], dtype=fx)
        idx = idx + add

        # NOTE(review): none of the values computed above are used here.
        return self.energy_(self.pps)
 def mask_for_prediction(self, prediction, groundtruth_mask=None,
                         extra_generation_steps=None):
     """Build a 0/1 mask covering each prediction up to its first EOS label.

     Along axis 0 (presumably time -- confirm with the caller) the mask is 1
     at every position up to and including the first occurrence of
     ``self.eos_label`` and 0 afterwards; the first row is always 1.

     Parameters
     ----------
     prediction : theano variable of predicted labels, indexed on axis 0
         by step and on axis 1 by batch position.
     groundtruth_mask : optional mask whose sum over axis 0 gives a
         per-column length. When given, positions at or beyond
         ``length + extra_generation_steps`` are masked out as well.
     extra_generation_steps : optional number of steps allowed beyond the
         ground-truth length; ``None`` means no extra steps.

     Returns
     -------
     The mask as a ``theano.config.floatX`` variable.
     """
     # Count the EOS labels seen so far; the indicator is cast to floatX
     # before accumulating.
     eos_seen = tensor.cumsum(
         tensor.eq(prediction, self.eos_label).astype(theano.config.floatX),
         axis=0)
     prediction_mask = tensor.lt(eos_seen, 1).astype(theano.config.floatX)
     # Shift by one step so the EOS position itself stays unmasked, and
     # force the first step on (it received the wrapped-around last row).
     prediction_mask = tensor.roll(prediction_mask, 1, 0)
     prediction_mask = tensor.set_subtensor(
         prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
     # Compare against None explicitly: truth-testing a tensor is either
     # always true or an error, never a test for "argument was supplied".
     if groundtruth_mask is not None:
         max_lengths = groundtruth_mask.sum(axis=0)
         if extra_generation_steps is not None:
             max_lengths = max_lengths + extra_generation_steps
         prediction_mask *= tensor.lt(
             tensor.arange(prediction.shape[0])[:, None], max_lengths[None, :])
     return prediction_mask
示例#34
0
 def chunk_grad(i):
     '''Gradient for the i-th chunk of the gradient variables.

     ``wrt`` is tiled ``chunk_size`` times, the expression is evaluated on
     every tiled row (through ``func`` when one is given, otherwise by
     cloning ``expr`` inside a scan), and ``tt.grad`` is called with an
     identity-like matrix rolled by ``i * chunk_size`` columns as the known
     gradient of that expression.
     '''
     tiled_wrt = tt.tile(wrt, (chunk_size, 1))
     if func is None:
         # No callable available: rebuild expr once per tiled row.
         tiled_expr, _ = theano.scan(
             fn=lambda wrt_: theano.clone(expr, {wrt: wrt_}),
             sequences=tiled_wrt)
     else:
         tiled_expr = func(tiled_wrt)
     # Rolling the identity-like matrix along axis 1 moves its ones to the
     # columns that belong to this chunk.
     selector = tt.roll(tt.identity_like(tiled_expr),
                        i * chunk_size,
                        axis=1)
     return tt.grad(cost=None,
                    wrt=tiled_wrt,
                    known_grads={tiled_expr: selector})
示例#35
0
            def _shift_step(c_mem, c_shift):
                # c_mem is (note, mem)
                # c_shift is an int
                #
                # "drop" mode: rows shifted past either end are discarded and
                # zero rows are inserted at the opposite end.
                # "roll" mode: rows wrap around, with the shift taken modulo 12.
                # Any other mode falls through and yields None.
                mode = self.mode
                if mode == "drop":
                    width = self.window_size
                    # Clamp the shift (and its negation) to [0, window_size].
                    cut_front = T.maximum(0, T.minimum(c_shift, width))
                    cut_back = T.maximum(0, T.minimum(-c_shift, width))
                    kept = c_mem[cut_front:width - cut_back, :]
                    pad_front = T.zeros((cut_back, per_note))
                    pad_back = T.zeros((cut_front, per_note))
                    return T.concatenate([pad_front, kept, pad_back], 0)
                if mode == "roll":
                    return T.roll(c_mem, (-c_shift) % 12, axis=0)
示例#36
0
        def _scan_fn(cprobs, cpos):
            """Expand one step's probabilities to the low_bound..high_bound range.

            cprobs: probability vector for this step; when self.with_artic is
                set its first two entries are passed through unchanged and
                the rest are the relative probabilities, otherwise all of it
                is relative and two ones are used in place of the first part.
            cpos: current position; the relative part is rolled by
                (cpos - low_bound) % 12 before being tiled.

            Returns the two leading entries followed by the rolled relative
            probabilities tiled to length high_bound - low_bound.
            """
            if self.with_artic:
                abs_probs = cprobs[:2]
                rel_probs = cprobs[2:]
            else:
                rel_probs = cprobs
                abs_probs = T.ones((2,))

            aligned = T.roll(rel_probs, (cpos-low_bound)%12)

            span = high_bound - low_bound
            # Force true division: with two ints Python 2 would floor the
            # quotient before math.ceil sees it, giving one tile too few when
            # the span is not a multiple of WINDOW_SIZE.
            num_tile = int(math.ceil(span / float(self.WINDOW_SIZE)))

            tiled = T.tile(aligned, (num_tile,))[:span]

            return T.concatenate([abs_probs, tiled], 0)
示例#37
0
def l2_paired(x):
    """Spectral smoothing

    Applies a modified L2 norm that takes into account the locality of the
    information: every entry is compared with its successor along the last
    axis and the squared differences are summed. The last entry has no
    successor and is compared against zero.

    Parameters
    ----------
    x : theano tensor
        The input tensor. Its shape is evaluated eagerly through
        ``x.shape.eval()``.

    Returns
    -------
    theano tensor
        The sum of the squared differences.
    """
    dims = x.shape.eval()
    last_axis = len(dims) - 1
    # Identity matrix with the bottom-right entry cleared: multiplying by it
    # zeroes the last column, i.e. the entry that wrapped around in the roll.
    keep = np.eye(dims[-1])
    keep[-1, -1] = 0
    successor = T.roll(x, -1, axis=last_axis)
    difference = x - T.dot(successor, keep)
    return T.sum(difference**2)
    def get_output_for(self, inputs, **kwargs):
        '''
        Fill the unsampled entries of the input using accumulated
        neighbouring frames.

        Parameters
        ------------------------------
        inputs: two 5d tensors, [kspace_data, mask], each of shape (n, 2, nx, ny, nt)

        Returns
        ------------------------------
        output: 5d tensor built by concatenating along axis 1 (channels) the
        original kspace_data followed by one filled-in copy per entry of
        self.n_samples. In each copy, positions where mask is 1 keep the
        original value and the others take the accumulated average.
        '''
        x, mask = inputs[0], inputs[1]

        def _accumulate(source):
            # Running roll_and_sum over `source`, one output per step, for
            # as many steps as the largest requested sample count.
            steps, _ = theano.scan(fn=roll_and_sum,
                                   outputs_info=T.zeros_like(x),
                                   non_sequences=(source),
                                   n_steps=T.constant(np.max(self.n_samples)))
            return steps

        result = _accumulate(x)
        mask_result = _accumulate(mask)

        results = [x]
        for i, t in enumerate(self.n_samples):
            # divide unbiasedly
            c = float(t) if self.divide_by_n else 1.0

            # when rolling back, need extra 1 because roll_and_sum rolls after adding a val.
            avg = T.roll(result[t - 1] / T.maximum(c, mask_result[t - 1]),
                         -self.frame_dist[i] - 1,
                         axis=-1)
            results.append(avg * (1 - mask) + x * mask)

        return T.concatenate(results, axis=1)  # concatenate along channels
示例#39
0
    def __init__(self, input, n_in):
        """Build the scoring and Viterbi-decoding graphs for this layer.

        input: symbolic matrix scanned along its first axis, one row per
            step (presumably per-step class scores -- confirm with caller).
        n_in: number of classes. self.A has n_in + 1 rows; row n_in is the
            one used for the first step, both in the initial Viterbi score
            and as the "previous index" of the first element.

        Sets self.f_score (compiled function of input and an index vector),
        self.path / self.predict (symbolic decoded path, starting with the
        arg-max of the final step) and self.output.
        """
        delta = 0.01
        self.n_in = n_in
        self.input = input
        self.name = "VL" + str(n_in)
        initial_A = np.random.uniform(-1, 1, (n_in + 1, n_in)) * delta
        self.A = theano.shared(initial_A.astype(T.config.floatX))

        # Index of the previous element for every position; position 0 has
        # no predecessor and gets the extra index n_in.
        indices = T.ivector('indices')
        prev_indices = T.set_subtensor(T.roll(indices, 1, axis=0)[0], n_in)

        self.params = [self.A]

        # Score of a given index sequence: sum of the per-step scores.
        step_scores, _ = theano.scan(fn=self.score,
                                     sequences=[input, indices, prev_indices],
                                     n_steps=input.shape[0])
        self.f_score = theano.function([self.input, indices],
                                       T.sum(step_scores))

        # Forward pass starting from the first row of the input.
        init_score = self.A[n_in] + input[0]
        (best_score, best_index), _ = theano.scan(
            fn=self.viterbi,
            outputs_info=[init_score, None],
            sequences=input[1:],
            n_steps=input.shape[0] - 1)
        last = T.argmax(best_score[-1])

        # Backward pass over the stored indices.
        best_path, _ = theano.scan(fn=self.findPath,
                                   outputs_info=[last],
                                   sequences=best_index,
                                   go_backwards=True,
                                   n_steps=best_index.shape[0])

        self.path = T.concatenate(([last], best_path))

        self.output = T.max(best_score, axis=0)
        self.predict = self.path
        # self.predict = theano.function([input], self.path) 
        # self.output = theano.function([input], bestScore[-1])
        self.updates = None
    def get_output_for(self, inputs, **kwargs):
        '''
        Fill the unsampled entries of the input using accumulated
        neighbouring frames.

        Parameters
        ------------------------------
        inputs: two 5d tensors, [kspace_data, mask], each of shape (n, 2, nx, ny, nt)

        Returns
        ------------------------------
        output: 5d tensor built by concatenating along axis 1 (channels) the
        original kspace_data followed by one filled-in copy per entry of
        self.n_samples. In each copy, positions where mask is 1 keep the
        original value and the others take the accumulated average.
        '''
        x, mask = inputs[0], inputs[1]

        def _running_sum(source):
            # One roll_and_sum output per step, up to the largest requested
            # sample count.
            partial, _ = theano.scan(
                fn=roll_and_sum,
                outputs_info=T.zeros_like(x),
                non_sequences=(source),
                n_steps=T.constant(np.max(self.n_samples)))
            return partial

        result = _running_sum(x)
        mask_result = _running_sum(mask)

        results = [x]
        for i, t in enumerate(self.n_samples):
            # divide unbiasedly
            c = float(t) if self.divide_by_n else 1.0

            # when rolling back, need extra 1 because roll_and_sum rolls after adding a val.
            avg = T.roll(result[t-1] / T.maximum(c, mask_result[t-1]),
                         -self.frame_dist[i]-1,
                         axis=-1)
            results.append(avg * (1-mask) + x * mask)

        return T.concatenate(results, axis=1)  # concatenate along channels
示例#41
0
def add_exploration(recognizer, data, train_conf):
    """Build the prediction graph used for exploration during training.

    The mode is read from ``train_conf['exploration']`` (default
    ``'imitative'``):

    * ``'imitative'`` -- nothing is generated; ``(None, None)`` is returned.
    * ``'greedy'`` -- the recognizer's generated outputs are used, with a
      mask that is 1 up to and including the first ``data.eos_label``.
    * ``'mixed'`` -- for each batch column a binomial draw (p=0.5) chooses
      between the generated outputs and the zero-padded ground-truth
      labels, together with the matching mask.

    Returns
    -------
    (prediction, prediction_mask); gradients are disconnected from the mask.

    Raises
    ------
    ValueError
        If the configured mode is none of the three above.
    """
    prediction = None
    prediction_mask = None
    explore_conf = train_conf.get('exploration', 'imitative')
    if explore_conf in ['greedy', 'mixed']:
        # Generation may run this many steps longer than the labels.
        length_expand = 10
        prediction = recognizer.get_generate_graph(
            n_steps=recognizer.labels.shape[0] + length_expand)['outputs']
        # Count EOS labels in floatX: the comparison result has a narrow
        # dtype, and accumulating in it could wrap on long outputs and
        # re-open the mask after the end of the sequence.
        eos_seen = tensor.cumsum(
            tensor.eq(prediction, data.eos_label).astype(floatX), axis=0)
        prediction_mask = tensor.lt(eos_seen, 1).astype(floatX)
        # Shift by one step so the EOS position itself is kept, and force
        # the first step on (it received the wrapped-around last row).
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))

        if explore_conf == 'mixed':
            batch_size = recognizer.labels.shape[1]
            # Pad labels and their mask with zeros to the generated length.
            targets = tensor.concatenate([
                recognizer.labels,
                tensor.zeros((length_expand, batch_size), dtype='int64')
            ])

            targets_mask = tensor.concatenate([
                recognizer.labels_mask,
                tensor.zeros((length_expand, batch_size), dtype=floatX)
            ])
            rng = MRG_RandomStreams()
            # 1 -> use the generated sequence, 0 -> use the ground truth.
            generate = rng.binomial((batch_size, ), p=0.5, dtype='int64')
            prediction = (generate[None, :] * prediction +
                          (1 - generate[None, :]) * targets)
            prediction_mask = (
                tensor.cast(generate[None, :] * prediction_mask, floatX) +
                tensor.cast((1 - generate[None, :]) * targets_mask, floatX))

        prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    elif explore_conf != 'imitative':
        raise ValueError(
            "unknown exploration mode: %r" % (explore_conf,))

    return prediction, prediction_mask
示例#42
0
def add_exploration(recognizer, data, train_conf):
    """Build the prediction graph used for exploration during training.

    The mode is read from ``train_conf['exploration']`` (default
    ``'imitative'``):

    * ``'imitative'`` -- nothing is generated; ``(None, None)`` is returned.
    * ``'greedy'`` -- the recognizer's generated outputs are used, with a
      mask that is 1 up to and including the first ``data.eos_label``.
    * ``'mixed'`` -- for each batch column a binomial draw (p=0.5) chooses
      between the generated outputs and the zero-padded ground-truth
      labels, together with the matching mask.

    Returns
    -------
    (prediction, prediction_mask); gradients are disconnected from the mask.

    Raises
    ------
    ValueError
        If the configured mode is none of the three above.
    """
    prediction = None
    prediction_mask = None
    explore_conf = train_conf.get('exploration', 'imitative')
    if explore_conf in ['greedy', 'mixed']:
        # Generation may run this many steps longer than the labels.
        length_expand = 10
        prediction = recognizer.get_generate_graph(
            n_steps=recognizer.labels.shape[0] + length_expand)['outputs']
        # Count EOS labels in floatX: the comparison result has a narrow
        # dtype, and accumulating in it could wrap on long outputs and
        # re-open the mask after the end of the sequence.
        eos_seen = tensor.cumsum(
            tensor.eq(prediction, data.eos_label).astype(floatX), axis=0)
        prediction_mask = tensor.lt(eos_seen, 1).astype(floatX)
        # Shift by one step so the EOS position itself is kept, and force
        # the first step on (it received the wrapped-around last row).
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))

        if explore_conf == 'mixed':
            batch_size = recognizer.labels.shape[1]
            # Pad labels and their mask with zeros to the generated length.
            targets = tensor.concatenate([
                recognizer.labels,
                tensor.zeros((length_expand, batch_size), dtype='int64')])

            targets_mask = tensor.concatenate([
                recognizer.labels_mask,
                tensor.zeros((length_expand, batch_size), dtype=floatX)])
            rng = MRG_RandomStreams()
            # 1 -> use the generated sequence, 0 -> use the ground truth.
            generate = rng.binomial((batch_size,), p=0.5, dtype='int64')
            prediction = (generate[None, :] * prediction +
                          (1 - generate[None, :]) * targets)
            prediction_mask = (tensor.cast(generate[None, :] *
                                           prediction_mask, floatX) +
                               tensor.cast((1 - generate[None, :]) *
                                           targets_mask, floatX))

        prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    elif explore_conf != 'imitative':
        raise ValueError(
            "unknown exploration mode: %r" % (explore_conf,))

    return prediction, prediction_mask
示例#43
0
    def get_output(self, train):
        """Return the layer input rolled by ``self.shift`` along ``self.axis``.

        ``train`` is passed through unchanged to ``self.get_input``.
        """
        shift, axis = self.shift, self.axis
        layer_input = self.get_input(train)
        return T.roll(layer_input, shift, axis=axis)
示例#44
0
  def __init__(self,
               n_out = None,
               n_units = None,
               direction = 1,
               truncation = -1,
               sampling = 1,
               encoder = None,
               unit = 'lstm',
               n_dec = 0,
               attention = "none",
               recurrent_transform = "none",
               recurrent_transform_attribs = "{}",
               attention_template = 128,
               attention_distance = 'l2',
               attention_step = "linear",
               attention_beam = 0,
               attention_norm = "exp",
               attention_momentum = "none",
               attention_sharpening = 1.0,
               attention_nbest = 0,
               attention_store = False,
               attention_smooth = False,
               attention_glimpse = 1,
               attention_filters = 1,
               attention_accumulator = 'sum',
               attention_loss = 0,
               attention_bn = 0,
               attention_lm = 'none',
               attention_ndec = 1,
               attention_memory = 0,
               attention_alnpts = 0,
               attention_epoch  = 1,
               attention_segstep=0.01,
               attention_offset=0.95,
               attention_method="epoch",
               attention_scale=10,
               context=-1,
               base = None,
               aligner = None,
               lm = False,
               force_lm = False,
               droplm = 1.0,
               forward_weights_init=None,
               bias_random_init_forget_shift=0.0,
               copy_weights_from_base=False,
               segment_input=False,
               join_states=False,
               sample_segment=None,
               **kwargs):
    """
    :param n_out: number of cells
    :param n_units: used when initialized via Network.from_hdf_model_topology
    :param direction: process sequence in forward (1) or backward (-1) direction
    :param truncation: gradient truncation
    :param sampling: scan every nth frame only
    :param encoder: list of encoder layers used as initalization for the hidden state
    :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru')
    :param n_dec: absolute number of steps to unfold the network if integer, else relative number of steps from encoder
    :param recurrent_transform: name of recurrent transform
    :param recurrent_transform_attribs: dictionary containing parameters for a recurrent transform
    :param attention_template:
    :param attention_distance:
    :param attention_step:
    :param attention_beam:
    :param attention_norm:
    :param attention_sharpening:
    :param attention_nbest:
    :param attention_store:
    :param attention_align:
    :param attention_glimpse:
    :param attention_lm:
    :param base: list of layers which outputs are considered as based during attention mechanisms
    :param lm: activate RNNLM
    :param force_lm: expect previous labels to be given during testing
    :param droplm: probability to take the expected output as predecessor instead of the real one when LM=true
    :param bias_random_init_forget_shift: initialize forget gate bias of lstm networks with this value
    """
    source_index = None
    if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')):
      kwargs['sources'] = []
      source_index = kwargs['index']
    unit_given = unit
    from Device import is_using_gpu
    if unit == 'lstm':  # auto selection
      if not is_using_gpu():
        unit = 'lstme'
      elif recurrent_transform == 'none' and (not lm or droplm == 0.0):
        unit = 'lstmp'
      else:
        unit = 'lstmc'
    elif unit in ("lstmc", "lstmp") and not is_using_gpu():
      unit = "lstme"
    if segment_input:
      if is_using_gpu():
        unit = "lstmps"
      else:
        unit = "lstms"
    if n_out is None:
      assert encoder
      n_out = sum([enc.attrs['n_out'] for enc in encoder])
    kwargs.setdefault("n_out", n_out)
    if n_units is not None:
      assert n_units == n_out
    self.attention_weight = T.constant(1.,'float32')
    if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'):
      kwargs['sources'] = []
    elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'):
      kwargs['sources'] = []
    super(RecurrentUnitLayer, self).__init__(**kwargs)
    self.set_attr('from', ",".join([s.name for s in self.sources]) if self.sources else "null")
    self.set_attr('n_out', n_out)
    self.set_attr('unit', unit_given.encode("utf8"))
    self.set_attr('truncation', truncation)
    self.set_attr('sampling', sampling)
    self.set_attr('direction', direction)
    self.set_attr('lm', lm)
    self.set_attr('force_lm', force_lm)
    self.set_attr('droplm', droplm)
    if bias_random_init_forget_shift:
      self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift)
    self.set_attr('attention_beam', attention_beam)
    self.set_attr('recurrent_transform', recurrent_transform.encode("utf8"))
    if isinstance(recurrent_transform_attribs, str):
      recurrent_transform_attribs = json.loads(recurrent_transform_attribs)
    if attention_template is not None:
      self.set_attr('attention_template', attention_template)
    self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs)
    self.set_attr('attention_distance', attention_distance.encode("utf8"))
    self.set_attr('attention_step', attention_step.encode("utf8"))
    self.set_attr('attention_norm', attention_norm.encode("utf8"))
    self.set_attr('attention_sharpening', attention_sharpening)
    self.set_attr('attention_nbest', attention_nbest)
    attention_store = attention_store or attention_smooth or attention_momentum != 'none'
    self.set_attr('attention_store', attention_store)
    self.set_attr('attention_smooth', attention_smooth)
    self.set_attr('attention_momentum', attention_momentum.encode('utf8'))
    self.set_attr('attention_glimpse', attention_glimpse)
    self.set_attr('attention_filters', attention_filters)
    self.set_attr('attention_lm', attention_lm)
    self.set_attr('attention_bn', attention_bn)
    self.set_attr('attention_accumulator', attention_accumulator)
    self.set_attr('attention_ndec', attention_ndec)
    self.set_attr('attention_memory', attention_memory)
    self.set_attr('attention_loss', attention_loss)
    self.set_attr('n_dec', n_dec)
    self.set_attr('segment_input', segment_input)
    self.set_attr('attention_alnpts', attention_alnpts)
    self.set_attr('attention_epoch', attention_epoch)
    self.set_attr('attention_segstep', attention_segstep)
    self.set_attr('attention_offset', attention_offset)
    self.set_attr('attention_method', attention_method)
    self.set_attr('attention_scale', attention_scale)
    if segment_input:
      if not self.eval_flag:
      #if self.eval_flag:
        if isinstance(self.sources[0],RecurrentUnitLayer):
          self.inv_att = self.sources[0].inv_att #NBT
        else:
          if not join_states:
            self.inv_att = self.sources[0].attention #NBT
          else:
            assert hasattr(self.sources[0], "nstates"), "source does not have number of states!"
            ns = self.sources[0].nstates
            self.inv_att = self.sources[0].attention[(ns-1)::ns]
        inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN
        inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2])))
        inv_att = T.max(inv_att,axis=-1)
      else:
        inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1]))
    if encoder and hasattr(encoder[0],'act'):
      self.set_attr('encoder', ",".join([e.name for e in encoder]))
    if base:
      self.set_attr('base', ",".join([b.name for b in base]))
    else:
      base = encoder
    self.base = base
    self.encoder = encoder
    if aligner:
      self.aligner = aligner
    self.set_attr('n_units', n_out)
    unit = eval(unit.upper())(**self.attrs)
    assert isinstance(unit, Unit)
    self.unit = unit
    kwargs.setdefault("n_out", unit.n_out)
    n_out = unit.n_out
    self.set_attr('n_out', unit.n_out)
    if n_dec < 0:
      source_index = self.index
      n_dec *= -1
    if n_dec != 0:
      self.target_index = self.index
      if isinstance(n_dec,float):
        if not source_index:
          source_index = encoder[0].index if encoder else base[0].index
        lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32')
        idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]),
                            [lengths], [T.max(lengths)+1])
        self.index = idx.dimshuffle(1,0)[:-1]
        n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32')
      else:
        if encoder:
          self.index = encoder[0].index
        self.index = T.ones((n_dec,self.index.shape[1]),'int8')
    else:
      n_dec = self.index.shape[0]
    # initialize recurrent weights
    self.W_re = None
    if unit.n_re > 0:
      self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name))
    # initialize forward weights
    bias_init_value = self.create_bias(unit.n_in).get_value()
    if bias_random_init_forget_shift:
      assert unit.n_units * 4 == unit.n_in  # (input gate, forget gate, output gate, net input)
      bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift
    self.b.set_value(bias_init_value)
    if not forward_weights_init:
      forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re
    else:
      self.set_attr('forward_weights_init', forward_weights_init)
    self.forward_weights_init = forward_weights_init
    self.W_in = []
    sample_mean, gamma = None, None
    if copy_weights_from_base:
      self.params = {}
      #self.W_re = self.add_param(base[0].W_re)
      #self.W_in = [ self.add_param(W) for W in base[0].W_in ]
      #self.b = self.add_param(base[0].b)
      self.W_re = base[0].W_re
      self.W_in = base[0].W_in
      self.b = base[0].b
      if self.attrs.get('batch_norm', False):
        sample_mean = base[0].sample_mean
        gamma = base[0].gamma
      #self.masks = base[0].masks
      #self.mass = base[0].mass
    else:
      for s in self.sources:
        W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name))
        self.W_in.append(self.add_param(W))
    # make input
    z = self.b
    for x_t, m, W in zip(self.sources, self.masks, self.W_in):
      if x_t.attrs['sparse']:
        if x_t.output.ndim == 3: out_dim = x_t.output.shape[2]
        elif x_t.output.ndim == 2: out_dim = 1
        else: assert False, x_t.output.ndim
        if x_t.output.ndim == 3:
          z += W[T.cast(x_t.output[:,:,0], 'int32')]
        elif x_t.output.ndim == 2:
          z += W[T.cast(x_t.output, 'int32')]
        else:
          assert False, x_t.output.ndim
      elif m is None:
        z += T.dot(x_t.output, W)
      else:
        z += self.dot(self.mass * m * x_t.output, W)
    #if self.attrs['batch_norm']:
    #  z = self.batch_norm(z, unit.n_in)
    num_batches = self.index.shape[1]
    self.num_batches = num_batches
    non_sequences = []
    if self.attrs['lm'] or attention_lm != 'none':
      if not 'target' in self.attrs:
        self.attrs['target'] = 'classes'
      if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm):
        if copy_weights_from_base:
          self.W_lm_in = base[0].W_lm_in
          self.b_lm_in = base[0].b_lm_in
        else:
          l = sqrt(6.) / sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out)
          values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX)
          self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_in_"+self.name))
          self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in')
      l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX)
      if copy_weights_from_base:
        self.W_lm_out = base[0].W_lm_out
      else:
        self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_out_"+self.name))
      if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        self.lmmask = 1
        #if recurrent_transform != 'none':
        #  recurrent_transform = recurrent_transform[:-3]
      elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm):
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        srng = RandomStreams(self.rng.randint(1234) + 1)
        self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)
      else:
        self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)

    if recurrent_transform == 'input': # attention is just a sequence dependent bias (lstmp compatible)
      src = []
      src_names = []
      n_in = 0
      for e in base:
        #src_base = [ s for s in e.sources if s.name not in src_names ]
        #src_names += [ s.name for s in e.sources ]
        src_base = [ e ]
        src_names += [e.name]
        src += [s.output for s in src_base]
        n_in += sum([s.attrs['n_out'] for s in src_base])
      self.xc = T.concatenate(src, axis=2)
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + n_in)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX)
      self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name = "W_att_xc"))
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX)
      self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name = "W_att_in"))
      zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc))) # TB1
      self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2],axis=2), axis=0, keepdims=True), self.W_att_in)
      recurrent_transform = 'none'
    elif recurrent_transform == 'attention_align':
      max_skip = base[0].attrs['max_skip']
      values = numpy.zeros((max_skip,), dtype=theano.config.floatX)
      self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b")
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip)
      values = numpy.asarray(self.rng.uniform(
        low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX)
      self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W")
      y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32'))  # NB
      y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0)  # (N+1)B
      y_t = y_t[1:] - y_t[:-1]  # NB
      self.y_t = y_t # T.clip(y_t,numpy.float32(0),numpy.float32(max_skip - 1))

      self.y_t = T.cast(self.base[0].backtrace,'float32')
    elif recurrent_transform == 'attention_segment':
      assert aligner.attention, "Segment-wise attention requires attention points!"

    recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self)
    assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase)
    unit.recurrent_transform = recurrent_transform_inst
    self.recurrent_transform = recurrent_transform_inst
    # scan over sequence
    for s in range(self.attrs['sampling']):
      index = self.index[s::self.attrs['sampling']]

      if context > 0:
        from TheanoUtil import context_batched
        n_batches = z.shape[1]
        time, batch, dim = z.shape[0], z.shape[1], z.shape[2]
        #z = context_batched(z[::direction or 1], window=context)[::direction or 1] # TB(CD)

        from theano.ifelse import ifelse
        def context_window(idx, x_in, i_in):
          x_out = x_in[idx:idx + context]
          x_out = x_out.dimshuffle('x',1,0,2).reshape((1, batch, dim * context))
          i_out = i_in[idx:idx+1].repeat(context, axis=0)
          i_out = ifelse(T.lt(idx,context),T.set_subtensor(i_out[:context - idx],numpy.int8(0)),i_out).reshape((1, batch * context))
          return x_out, i_out

        z = z[::direction or 1]
        i = index[::direction or 1]
        out, _ = theano.map(context_window, sequences = [T.arange(z.shape[0])], non_sequences = [T.concatenate([T.zeros((context - 1,z.shape[1],z.shape[2]),dtype='float32'),z],axis=0), i])
        z = out[0][::direction or 1]
        i = out[1][::direction or 1] # T(BC)
        direction = 1
        z = z.reshape((time * batch, context * dim)) # (TB)(CD)
        z = z.reshape((time * batch, context, dim)).dimshuffle(1,0,2) # C(TB)D
        i = i.reshape((time, context, batch)).dimshuffle(1,0,2).reshape((context, time * batch))
        index = i
        num_batches = time * batch

      sequences = z
      sources = self.sources
      if encoder:
        if recurrent_transform == "attention_segment":
          if hasattr(encoder[0],'act'):
            outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
          else:
           # outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
            outputs_info[0] = self.aligner.output[-1]
        elif hasattr(encoder[0],'act'):
          outputs_info = [ T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act) ]
        else:
          outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))
      else:
        outputs_info = [ T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act) ]

      if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        if self.network.y[self.attrs['target']].ndim == 3:
          sequences += T.dot(self.network.y[self.attrs['target']],self.W_lm_out)
        else:
          y = self.y_in[self.attrs['target']].flatten()
          sequences += self.W_lm_out[y].reshape((index.shape[0],index.shape[1],unit.n_in))

      if sequences == self.b:
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))

      if unit.recurrent_transform:
        outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial()

      index_f = T.cast(index, theano.config.floatX)
      unit.set_parent(self)

      if segment_input:
        outputs = unit.scan_seg(x=sources,
                                z=sequences[s::self.attrs['sampling']],
                                att = inv_att,
                                non_sequences=non_sequences,
                                i=index_f,
                                outputs_info=outputs_info,
                                W_re=self.W_re,
                                W_in=self.W_in,
                                b=self.b,
                                go_backwards=direction == -1,
                                truncate_gradient=self.attrs['truncation'])
      else:
        outputs = unit.scan(x=sources,
                            z=sequences[s::self.attrs['sampling']],
                            non_sequences=non_sequences,
                            i=index_f,
                            outputs_info=outputs_info,
                            W_re=self.W_re,
                            W_in=self.W_in,
                            b=self.b,
                            go_backwards=direction == -1,
                            truncate_gradient=self.attrs['truncation'])

      if not isinstance(outputs, list):
        outputs = [outputs]
      if outputs:
        outputs[0].name = "%s.act[0]" % self.name
        if context > 0:
          for i in range(len(outputs)):
            outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1]//n_batches,n_batches,outputs[i].shape[2]))

      if unit.recurrent_transform:
        unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):]

      if self.attrs['sampling'] > 1:
        if s == 0:
          self.act = [ T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs ]
        self.act = [ T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot,act in zip(self.act, outputs) ]
      else:
        self.act = outputs[:unit.n_act]
        if len(outputs) > unit.n_act:
          self.aux = outputs[unit.n_act:]
    if self.attrs['attention_store']:
      self.attention = [ self.aux[i].dimshuffle(0,2,1) for i,v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_') ] # NBT
      for i in range(len(self.attention)):
        vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1))
        last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1)
        self.attention[i] = T.concatenate([self.attention[i][1:],last],axis=0)[::direction]

    self.cost_val = numpy.float32(0)
    if recurrent_transform == 'attention_align':
      back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')])
      def make_output(base, yout, trace, length):
        length = T.cast(length, 'int32')
        idx = T.cast(trace[:length][::-1],'int32')
        x_out = T.concatenate([base[idx],T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')],axis=0)
        y_out = T.concatenate([yout[idx,T.arange(length)],T.zeros((self.index.shape[0] + 1 - length, ), 'float32')],axis=0)
        return x_out, y_out

      output, _ = theano.map(make_output,
                             sequences = [base[0].output.dimshuffle(1,0,2),
                                          self.y_t.dimshuffle(1,2,0),
                                          back.dimshuffle(1,0),
                                          T.sum(self.index,axis=0,dtype='float32')])
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      self.output = output[0].dimshuffle(1,0,2)[:-1]

      z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b
      z = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))
      idx = (self.index[1:].flatten() > 0).nonzero()
      idy = (self.index[1:][::-1].flatten() > 0).nonzero()
      y_out = T.cast(output[1],'int32').dimshuffle(1, 0)[:-1].flatten()
      nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy])
      self.cost_val = T.sum(nll)
      recog = T.argmax(z[idx], axis=1)
      real = y_out[idy]
      self.errors = lambda: T.sum(T.neq(recog, real))

      return

      back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      idx = (self.index[:-1].flatten() > 0).nonzero()
      idx = T.cast(back[::-1].flatten()[idx],'int32')
      x_out = base[0].output
      #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2)
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([x_out, base[0].output[1:]],axis=0)
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return


      skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape)
      shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      skips = T.concatenate([T.zeros_like(self.y_t[:1]),self.y_t[:-1]],axis=0)
      idx = shift + T.cumsum(skips, axis=0)
      idx = T.cast(idx[:-1].flatten(),'int32')
      #idx = (idx.flatten() > 0).nonzero()
      #idx = base[0].attention.flatten()
      x_out = base[0].output[::-1]
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1]
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return

    if recurrent_transform == 'batch_norm':
      self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0],axis=[0,1]),self.W_re)
      self.params['sample_mean_batch_norm'].custom_update_normalized = True

    self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma)
    self.params.update(unit.params)
示例#45
0
def add_fun(A, B, max_int, mem):
    """Return the distribution of the sum of two integer-valued variables.

    For every shift in ``1..max_int`` the column-reversed ``B`` is rolled
    along axis 1; the rolled copies are stacked and contracted against
    ``A`` with a batched dot product.  When ``B`` has ``max_int`` columns
    this works out to a per-row circular convolution of ``A`` with ``B``.

    ``mem`` is not used and is handed back unchanged as the second element
    of the returned pair.
    """
    flipped = B[:, ::-1]
    shifted = []
    for offset in range(1, max_int + 1):
        shifted.append(roll(flipped, offset, axis=1))
    # Stack on a new axis 1, then swap the last two axes before the product.
    B_prime = stack(shifted, axis=1).transpose(0, 2, 1)
    total = batched_dot(A, B_prime)
    return total, mem
示例#46
0
def negate_fun(A, max_int, mem):
    """Return the distribution of the negated integer variable.

    The columns of ``A`` are reversed and then rolled by one position along
    axis 1.  ``max_int`` is unused; ``mem`` is passed through unchanged as
    the second element of the returned pair.
    """
    mirrored = A[:, ::-1]
    negated = roll(mirrored, 1, axis=1)
    return negated, mem
示例#47
0
    def build_model(self):
        """Build the symbolic next-word prediction cost and compile ``self.f_cost``.

        Embeds the index matrix ``x``, runs it through the two LSTM layers
        registered under ``self.layers``, projects with ``U``/``b`` from
        ``self.tparams``, applies a softmax and computes a masked negative
        log-likelihood.

        Returns the tuple ``(use_noise, x, mask, cost)``.

        NOTE(review): as written this method cannot run unless the names
        ``mask`` and ``proj`` exist at module level -- neither is bound in
        this function (only ``mask_x`` / ``mask_y`` and ``proj_1`` are).
        ``y`` is used in ``cost`` but is not listed among the inputs of
        ``f_cost``.  Confirm the intended wiring before relying on it.
        """
        trng = RandomStreams(self.random_seed)
        # Shared scalar created with value 0; handed to dropout_layer below.
        use_noise = theano.shared(numpy_floatX(0.))
        # Simply encode this
        x = T.matrix('x', dtype='int64')
        y = T.matrix('y', dtype='int64')
        # NOTE(review): y_prime is computed but never used below.
        y_prime = T.roll(y, -1, 0)
        # Since we are simply predicting the next word, the
        # following statement shifts the content of the x by 1
        # in the time dimension for prediction (axis 0, assuming TxN)
        # NOTE(review): mask_x / mask_y are declared but the code below
        # refers to an unbound name `mask` instead.
        mask_x = T.matrix('mask_x', dtype=theano.config.floatX)
        mask_y = T.matrix('mask_y', dtype=theano.config.floatX)

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # Convert word indices to their embeddings
        # Resulting dims are (T x N x dim_proj)
        emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                         n_samples,
                                                         self.dim_proj])
        # Compute the hidden states
        # Note that these contain hidden states for elements which were
        # padded in input. The cost for these time steps are removed
        # before the calculation of the cost.
        enc_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask)
        # Use dropout on non-recurrent connections (Zaremba et al.)
        # NOTE(review): the dropped-out result is stored in proj_1, but the
        # next layer consumes enc_proj_1, so this dropout has no effect.
        if self.use_dropout:
            proj_1 = dropout_layer(enc_proj_1, use_noise, trng)
        enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1, self.dim_proj, mask=mask)
        if self.use_dropout:
            enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

        # Use the final state of the encoder as the initial hidden state of the decoder
        # NOTE(review): src_embedding is never passed to anything below.
        src_embedding = enc_proj_2[-1]
        # Run decoder LSTM
        # NOTE(review): this "decoder" section reuses the 'enc_lstm_1' /
        # 'enc_lstm_2' layers and the same `emb` input, and overwrites
        # enc_proj_2; dec_proj_1 is unused.  Looks like an unfinished
        # copy of the encoder section -- confirm.
        dec_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask)
        # Use dropout on non-recurrent connections (Zaremba et al.)
        if self.use_dropout:
            proj_1 = dropout_layer(enc_proj_1, use_noise, trng)
        enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1, self.dim_proj, mask=mask)
        if self.use_dropout:
            enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

        # NOTE(review): `proj` is not defined in this function.
        pre_s = T.dot(proj, self.tparams['U']) + self.tparams['b']
        # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
        # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
        # -1 is a proxy for infer dim based on input (numpy style)
        pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
        pred_r = T.nnet.softmax(pre_s_r)

        # Small offset added inside the log to avoid log(0); a larger value
        # is used when the predictions are float16.
        off = 1e-8
        if pred_r.dtype == 'float16':
            off = 1e-6

        # Note the use of flatten here. We can't directly index a 3-tensor
        # and hence we use the (T*N)xV view which is indexed by the flattened
        # label matrix, dim = (T*N)x1
        # Also, the cost (before calculating the mean) is multiplied (element-wise)
        # with the mask to eliminate the cost of elements that do not really exist.
        # i.e. Do not include the cost for elements which are padded
        cost = -T.sum(T.log(pred_r[T.arange(pred_r.shape[0]), y.flatten()] + off) * mask.flatten()) / T.sum(mask)

        # Compiled cost function taking (x, mask).
        self.f_cost = theano.function([x, mask], cost, name='f_cost')

        return use_noise, x, mask, cost
示例#48
0
        def _step_state(v_h_, x_h_, v_t_, x_t_, a_t_, a, is_aggressive):
            """Advance the host and target states by one time step of ``self.dt``.

            The host state is integrated directly from the action ``a``.  The
            target state is computed twice: exactly, from a hand-written
            acceleration rule, and approximately, by a three-hidden-layer
            ReLU network.  The returned target state is numerically the exact
            one, but is expressed as network output plus a gradient-blocked
            residual.

            Arguments (trailing underscore = value at the previous step):
            v_h_, x_h_ -- host speed and position
            v_t_, x_t_, a_t_ -- target speeds, positions and accelerations
            a -- host acceleration applied during this step
            is_aggressive -- selects which rule targets use when the host
                approaches (presumably a 0/1 flag per target -- TODO confirm)

            Returns ``(v_h, x_h, v_t, x_t, a_t, cost_transition)``.
            """

            # Position of the next target in the ordering (wraps around).
            next_x_t_ = tt.roll(x_t_,-1)

            # Gap from each target to the next one.
            relx = next_x_t_ - x_t_

            # fix the jump between -pi and +pi
            relx = (relx>=0) *relx + (relx<0)*(self.two_pi_r + relx)

            relx_to_host = x_h_ - x_t_

            # Host counts as the closest vehicle ahead when it is past
            # -0.5*host_length, ahead of the target, and nearer than the
            # next target.
            is_host_cipv = (x_h_ > -0.5*self.host_length) * (x_h_ > x_t_) * (relx_to_host < relx)

            # If host CIPV - Change relx to him
            relx = (is_host_cipv * ((x_h_ > 0)*x_h_ - x_t_)) + ((1-is_host_cipv) * relx)

            is_host_approaching = (x_h_ > -1.5*self.host_length) * (x_h_ <= -0.5 *self.host_length) * (x_t_ < 0) * (x_t_ > -0.25*self.two_pi_r) * ((next_x_t_ > 0) + (next_x_t_ < x_t_))

            # Default rule: accelerate proportionally to (gap - 2 * speed).
            accel_default = 5*(relx - 2*v_t_)

            # accel_is_aggressive = (3 - v_t_)/self.dt
            accel_is_aggressive = tt.maximum(3, 3*(relx_to_host - 1.5*v_t_))

            # Acceleration that brings the speed to 0.5 within one step.
            accel_not_aggressive = (0.5 - v_t_)/self.dt

            accel_host_approaching = is_aggressive * accel_is_aggressive + (1 - is_aggressive) * accel_not_aggressive

            accel = is_host_approaching * accel_host_approaching + (1- is_host_approaching) * accel_default

            #1. exact next state

            #1.1 host
            v_h = v_h_ + self.dt * a

            # clip host speed to the section [0, 3*v_0]
            v_h = tt.clip(v_h, 0, 3*self.v_0)

            x_h = x_h_ + self.dt * v_h

            # Wrap positions at or beyond two_pi_r/2 back by two_pi_r.
            x_h = (x_h>=(self.two_pi_r/2)) * (x_h - self.two_pi_r) + (x_h < (self.two_pi_r/2)) * x_h

            #1.2 targets
            # Target speed is not allowed to become negative.
            v_t_e = tt.maximum(0, v_t_ + self.dt * accel)

            x_t_e = x_t_ + self.dt * v_t_e

            # Stored as the speed difference over the step (not divided by dt).
            a_t_e = v_t_e - v_t_

            #2. learn the transition model between states
            state_      = tt.concatenate([v_h_ , x_h_, tt.flatten(v_t_), tt.flatten(x_t_), tt.flatten(a_t_), a])
            state_t_e   = tt.concatenate([tt.flatten(v_t_e), tt.flatten(x_t_e), tt.flatten(a_t_e)])

            # Block gradients through both the network input and its target.
            state_ = common.disconnected_grad(state_)
            state_t_e = common.disconnected_grad(state_t_e)

            h0 = tt.dot(state_, self.W_t_0) + self.b_t_0
            relu0 = tt.nnet.relu(h0)

            h1 = tt.dot(relu0, self.W_t_1) + self.b_t_1
            relu1 = tt.nnet.relu(h1)

            h2 = tt.dot(relu1, self.W_t_2) + self.b_t_2
            relu2 = tt.nnet.relu(h2)

            # Linear output layer (no bias term).
            state_t_hat = tt.dot(relu2, self.W_t_c)

            # Mean absolute error between predicted and exact target state.
            cost_transition = tt.mean(tt.abs_(state_t_hat - state_t_e))

            # Split the prediction into speed / position / acceleration
            # slices of length n_t and restore a trailing axis.
            v_t_a = (state_t_hat[0 : self.n_t]).dimshuffle(0,'x')
            x_t_a = (state_t_hat[self.n_t : 2 * self.n_t]).dimshuffle(0,'x')
            a_t_a = (state_t_hat[2 * self.n_t : ]).dimshuffle(0,'x')

            #3. prediction noise
            n_v_t = v_t_e - v_t_a
            n_x_t = x_t_e - x_t_a
            n_a_t = a_t_e - a_t_a

            #4. disconnect the gradient of the noise signals
            n_v_t = common.disconnected_grad(n_v_t)
            n_x_t = common.disconnected_grad(n_x_t)
            n_a_t = common.disconnected_grad(n_a_t)

            #5. add the noise to the approximation
            # Values equal the exact state; gradients flow only through the
            # network approximation.
            v_t = v_t_a + n_v_t
            x_t = x_t_a + n_x_t
            a_t = a_t_a + n_a_t

            # apply [-pi,pi] discontinuity
            x_t = (x_t>=(self.two_pi_r/2)) * (x_t - self.two_pi_r) + (x_t < (self.two_pi_r/2)) * x_t

            return v_h, x_h, v_t, x_t, a_t, cost_transition
示例#49
0
import os
import time
from RNN_theano import RNN_theano
import numpy as np
import theano as theano
import theano.tensor as T
from RNN_theano import train_with_sgd

# Scratch experiments with Theano: compiled functions, gradients and T.roll.
li = T.matrix('list')
m = T.scalar("m")
n = T.scalar("n")
# Compiled function evaluating 2*m + 3*n.
outf = theano.function([m,n], 2*m + 3*n)
cost = 2*m + 3*n
# Symbolic partial derivatives of the cost with respect to m and n.
dm = T.grad(cost, m)
dn = T.grad(cost, n)
# Roll each row of the matrix by -1 along axis 1.
rollf = theano.function([li], T.roll(li, -1, 1))
entry = [[0.5,1.1,3],  [0.8,0.1,3] , [0.3,1.5,3] ]
res = rollf(entry)

# Compiled functions returning the gradients as one-element lists.
oll = theano.function([m,n], [dm])
oll2 = theano.function([m,n], [dn])

value = oll(4,5)
value2 = oll2(4,5)

# Layer sizes for the weight matrices below.
inputd = 2
outputd = 1
hiddend = 20

# Uniform initialisation in +/- sqrt(1/d): U1 is (hiddend, inputd),
# V1 is (outputd, hiddend).
U1 = np.random.uniform(-np.sqrt(1./inputd), np.sqrt(1./inputd), (hiddend, inputd))
V1 = np.random.uniform(-np.sqrt(1./outputd), np.sqrt(1./outputd), (outputd, hiddend))
示例#50
0
    def __init__(self, rng, x, n_in, n_h, n_out, p, training, y=None, rnn_batch_training=False):
        """Initialise a standard RNN hidden unit that is also fed the previous target.

        :param rng: random state, fixed value for random state for reproducible objective results
        :param x: input data to current layer
        :param n_in: dimension of input data
        :param n_h: number of hidden units/blocks
        :param n_out: dimension of output data
        :param p: the probability of dropout
        :param training: a binary value to indicate training or testing (for dropout training)
        :param y: ground-truth output sequence; stored as ``self.groundtruth``.
            The graph built below reads ``self.groundtruth`` unconditionally,
            so when ``y`` is None the attribute must be provided some other
            way (e.g. by a subclass) before this constructor runs.
        :param rnn_batch_training: when true, the initial states are built as
            one row repeated ``x.shape[1]`` times instead of as vectors
        """
        self.input = x
        if y is not None:
            self.groundtruth = y

        # NOTE(review): srng.binomial(p=p) keeps a unit with probability p,
        # while the test-time branch scales by (1 - p); the two only agree
        # for p == 0.5.  Left unchanged -- confirm the intended convention.
        if p > 0.0:
            if training==1:
                srng = RandomStreams(seed=123456)
                self.input = T.switch(srng.binomial(size=x.shape,p=p), x, 0)
            else:
                self.input =  (1-p) * x #(1-p) *

        self.n_in  = int(n_in)
        self.n_h   = int(n_h)
        self.n_out = int(n_out)

        self.rnn_batch_training = rnn_batch_training

        # random initialisation
        Wx_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
        #Wh_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_h), size=(n_h, n_h)), dtype=config.floatX)
        #Wy_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_out), size=(n_out, n_h)), dtype=config.floatX)
        Ux_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_out)), dtype=config.floatX)
        #Uh_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_h), size=(n_h, n_out)), dtype=config.floatX)
        #Uy_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_out), size=(n_out, n_out)), dtype=config.floatX)

        # identity matrix initialisation
        Wh_value = np.asarray(np.eye(n_h, n_h), dtype=config.floatX)
        Wy_value = np.asarray(np.eye(n_out, n_h), dtype=config.floatX)
        Uh_value = np.asarray(np.eye(n_in, n_out), dtype=config.floatX)
        # The shape must be a single tuple: np.zeros(n_out, n_out) would pass
        # the second n_out as the dtype and raise TypeError.
        Uy_value = np.asarray(np.zeros((n_out, n_out)), dtype=config.floatX)

        # Input gate weights
        self.W_xi = theano.shared(value=Wx_value, name='W_xi')
        self.W_hi = theano.shared(value=Wh_value, name='W_hi')
        self.W_yi = theano.shared(value=Wy_value, name='W_yi')

        # Output gate weights
        self.U_xi = theano.shared(value=Ux_value, name='U_xi')
        self.U_hi = theano.shared(value=Uh_value, name='U_hi')
        self.U_yi = theano.shared(value=Uy_value, name='U_yi')

        # bias
        self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')
        self.b   = theano.shared(value=np.zeros((n_out, ), dtype=config.floatX), name='b')

        # initial value of hidden and cell state and output
        if self.rnn_batch_training:
            self.h0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'c0')
            self.y0 = theano.shared(value=np.zeros((1, n_out), dtype = config.floatX), name = 'y0')

            self.h0 = T.repeat(self.h0, x.shape[1], 0)
            self.c0 = T.repeat(self.c0, x.shape[1], 0)
            # Repeat y0 itself; the previous code repeated the (already
            # repeated) c0, giving y0 the wrong shape.
            self.y0 = T.repeat(self.y0, x.shape[1], 0)
        else:
            self.h0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'c0')
            self.y0 = theano.shared(value=np.zeros((n_out, ), dtype = config.floatX), name = 'y0')

        # Overrides the h0 chosen above in both modes.
        self.h0 = self.input[-1, 0:-4] # hard coded to remove coarse coding features

        # Ground truth shifted by one step along axis 0, i.e. the previous
        # target at every position (the first row wraps around to the last).
        self.outytm1 = T.roll(self.groundtruth, 1, 0)

        # Pre-compute the input and previous-output projections for the
        # recurrence (W_*) and for the output layer (U_*).  Wiy and Uiy were
        # referenced below without ever being defined; they are the
        # previous-target counterparts of Wix / Uix.
        self.Wix = T.dot(self.input, self.W_xi)
        self.Wiy = T.dot(self.outytm1, self.W_yi)
        self.Uix = T.dot(self.input, self.U_xi)
        self.Uiy = T.dot(self.outytm1, self.U_yi)

        [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function, sequences = [self.Wix, self.Wiy],
                                                                      outputs_info = [self.h0, self.c0])

        self.y = self.Uix + self.Uiy + T.dot(self.h, self.U_hi) + self.b
        self.output = T.nnet.softmax(self.y)

        # recurrent output params and additional input params
        self.params = [self.W_xi, self.W_hi, self.W_yi, self.U_xi, self.U_hi, self.U_yi, self.b_i, self.b]

        # L2 penalty over the recurrence weights and U_hi only.
        self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum() + (self.W_yi ** 2).sum() + (self.U_hi ** 2).sum()
示例#51
0
文件: lm.py 项目: anirudh9119/mscale
def get_interpolated_hiddens(old_hidden,  n_timesteps,
                             n_samples, interpolation_mask,
                             number_cons_hiddens):
    '''
    Interpolate between consecutive hidden states.

    old_hidden: matrix of hidden states to interpolate, laid out as
        number_of_hiddens * batch_size * hidden_size.
    n_timesteps: used only to reshape each per-sample result to
        (1, n_timesteps - 2, hidden_size).
    n_samples: batch dimension used when reshaping the pair tensor.
    interpolation_mask: the interpolation weights (alpha), for example
        [1, 0.8, 0.6, 0.4, 0.2].  Its length is taken with len(), and it
        sets how many interpolated states are produced per pair.
    number_cons_hiddens: number of consecutive pairs to build.

    For example, given hiddens h1, h2, h3, ..., h_n-1 the pairs are
    [h1, h2], [h2, h3], ..., [h_n-2, h_n-1], and for every weight a in the
    mask the value a * first + (1 - a) * second is emitted.

    The result is left-padded along axis 1 with len(interpolation_mask)
    zero rows.

    NOTE(review): hidden_size (1024) and batch_size (32) are hard-coded
    below rather than derived from the inputs.
    '''
    alpha = interpolation_mask
    hidden_size = 1024
    batch_size = 32


    num_cons_hiddens = number_cons_hiddens
    num_reduced_hiddens = num_cons_hiddens + 1
    number_interp = len(interpolation_mask)

    # Swap the first two axes so the batch axis comes first.
    X  = old_hidden.dimshuffle(1, 0, 2)
    # Duplicate every state along axis 1 (h1, h1, h2, h2, ...), shift by one
    # (h1, h2, h2, h3, ...), trim, and fold into consecutive pairs.
    new_matrix2 = repeat(X, 2, axis=1)
    new_matrix2 = tensor.roll(new_matrix2, -1, axis=1)
    new_matrix2 = new_matrix2[:, 0:2*num_reduced_hiddens-2, :]
    new_matrix2 = new_matrix2.reshape([n_samples, num_cons_hiddens, 2, hidden_size])

    def _step_slice(m_, interp_mask):
        # m_ is one pair; return one blended state per mask entry.
        interp_ret = []
        for i in range(number_interp):
            interp_ret.append(interp_mask[i] * m_[0] + (1-interp_mask[i])* m_[1])
        return interp_ret

    _step = _step_slice

    def step_batch(m_, alpha):
        # Inner scan: iterate over the pairs of a single sample.
        seqs = m_
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    non_sequences=[alpha])
        return rval

    _batch_step = step_batch
    seqs = new_matrix2
    # Outer scan: iterate over the samples in the batch.
    rval, updates = theano.scan(_batch_step,
                                sequences=seqs,
                                non_sequences=[alpha])
    out=[]
    out_batch =[]
    # Assemble the result at graph-construction time: for every sample,
    # chain the interpolated states of each pair, then stack the samples.
    for batch_index in range(batch_size):
        for i in range(num_cons_hiddens):
            something =  [rval[j][batch_index][i] for j in range(number_interp)]
            if i==0:
                out = something
            if i >=1:
                out  = tensor.concatenate([out, something], axis=0)
        if batch_index == 0:
            out_batch = out
        if batch_index == 1:
            out_batch = tensor.stacklists([out_batch, out])
        if batch_index > 1:
            out = tensor.reshape(out,[1, n_timesteps-2, hidden_size])
            out_batch = tensor.concatenate([out_batch, out])

    # Prepend number_interp zero rows along axis 1.
    zero_pad = tensor.zeros([out_batch.shape[0], number_interp , out_batch.shape[2]])
    out_batch = tensor.concatenate([zero_pad, out_batch], axis=1)
    return out_batch
示例#52
0
 def mycost(y_true, y_pred):
     """Squared-error cost plus a penalty on changes along axis 1.

     The prediction is rolled by one position along axis 1 and its first
     slice is overwritten with the un-rolled first slice, so position 0
     contributes no penalty.  The penalty term is weighted by 10.  Both
     terms are averaged over the last axis.
     """
     shifted = T.roll(y_pred, 1, axis=1)
     shifted = T.set_subtensor(shifted[:, 0, :], y_pred[:, 0, :])
     fit_term = T.mean(T.square(y_pred - y_true), axis=-1)
     smooth_term = T.mean(T.square(y_pred - shifted), axis=-1)
     return fit_term + 10 * smooth_term
def roll_and_sum(prior_result, orig):
    """Add ``orig`` to ``prior_result`` and roll the sum by one along the last axis."""
    return T.roll(prior_result + orig, 1, axis=-1)
示例#54
0
    def build_encoder(self, x, xmask=None, **kwargs):
        """Build the sentence-level and hierarchical encoder graphs.

        :param x: word indices; when ``x.ndim == 2`` it is laid out as
            (n_steps, batch_size), otherwise it is treated as a single
            sequence with batch size 1.
        :param xmask: optional mask; when omitted it is derived as
            ``x != self.eos_sym``.
        :param kwargs: passing any keyword switches to one-step (sampling)
            mode, which requires ``prev_h`` and ``prev_hs`` to be supplied
            and ``x`` not to be 2-dimensional.
        :return: ``(h, hs)`` -- the sentence-level hidden states and the
            hierarchical hidden states.
        """
        # Any keyword argument selects one-step mode.
        one_step = bool(kwargs)

        # if x.ndim == 2 then
        # x = (n_steps, batch_size)
        if x.ndim == 2:
            batch_size = x.shape[1]
        # else x = (word_1, word_2, word_3, ...)
        # or x = (last_word_1, last_word_2, last_word_3, ..)
        # in this case batch_size is 1
        else:
            batch_size = 1

        # if it is not one_step then we initialize everything to 0
        if not one_step:
            h_0 = T.alloc(np.float32(0), batch_size, self.qdim)
            hs_0 = T.alloc(np.float32(0), batch_size, self.sdim)
        # in sampling mode (i.e. one step) we require the previous states
        # to be handed in by the caller
        else:
            # in this case x.ndim != 2
            assert x.ndim != 2
            assert 'prev_h' in kwargs
            assert 'prev_hs' in kwargs
            h_0 = kwargs['prev_h']
            hs_0 = kwargs['prev_hs']

        xe = self.approx_embedder(x)
        # Identity check: `== None` would defer to whatever __eq__ the mask
        # object defines instead of testing for a missing argument.
        if xmask is None:
            xmask = T.neq(x, self.eos_sym)

        # Here we roll the mask so we avoid the need for separate
        # hr and h. The trick is simple: if the original mask is
        # 0 1 1 0 1 1 1 0 0 0 0 0 -- batch is filled with eos_sym
        # the rolled mask will be
        # 0 0 1 1 0 1 1 1 0 0 0 0 -- roll to the right
        # ^ ^
        # two resets </s> <s>
        # the first reset will reset h_init = 0
        # the second will reset </s> and update given x_t = <s>
        if xmask.ndim == 2:
            rolled_xmask = T.roll(xmask, 1, axis=0)
        else:
            rolled_xmask = T.roll(xmask, 1)

        # Gated Encoder
        if self.sent_step_type == "gated":
            f_enc = self.gated_sent_step
            o_enc_info = [h_0, None, None, None]
        else:
            f_enc = self.plain_sent_step
            o_enc_info = [h_0]

        if self.triple_step_type == "gated":
            f_hier = self.gated_triple_step
            o_hier_info = [hs_0, None, None, None]
        else:
            f_hier = self.plain_triple_step
            o_hier_info = [hs_0]

        # Run through all the sentence (encode everything)
        if not one_step:
            _res, _ = theano.scan(f_enc,
                                  sequences=[xe, rolled_xmask],
                                  outputs_info=o_enc_info)
        # Make just one step further
        else:
            _res = f_enc(xe, rolled_xmask, h_0)
        # Get the hidden state sequence
        h = _res[0]

        # All hierarchical sentence
        # The hs sequence is based on the original mask
        if not one_step:
            _res, _ = theano.scan(f_hier,
                                  sequences=[h, xmask],
                                  outputs_info=o_hier_info)
        # Just one step further
        else:
            _res = f_hier(h, xmask, hs_0)

        # The step functions may return a single value or a sequence of
        # outputs; the hidden state is the first element in the latter case.
        if isinstance(_res, (list, tuple)):
            hs = _res[0]
        else:
            hs = _res

        return h, hs
示例#55
0
def roll(x, shift, axis=-1):
    """Roll ``x`` by ``shift`` positions along ``axis`` (last axis by default).

    Thin wrapper around ``T.roll`` that supplies a default axis.
    """
    rolled = T.roll(x, shift, axis=axis)
    return rolled
示例#56
0
 def get_output(self, train):
     """Stack copies of the layer input, one per entry of ``self.offsets``.

     Each copy is the input rolled by that offset along ``self.axis``; the
     copies are stacked along ``self.offset_axis``.
     """
     layer_input = self.get_input(train)
     shifted = []
     for offset in self.offsets:
         shifted.append(T.roll(layer_input, offset, axis=self.axis))
     return T.stack(shifted, axis=self.offset_axis)
示例#57
0
 def get_output_mask(self, train=False):
     """Stack rolled copies of the input mask, or return None without one.

     Applies the same roll-and-stack as the layer output: one copy of the
     mask per entry of ``self.offsets``, rolled along ``self.axis`` and
     stacked along ``self.offset_axis``.
     """
     input_mask = self.get_input_mask(train)
     if input_mask is not None:
         shifted = []
         for offset in self.offsets:
             shifted.append(T.roll(input_mask, offset, axis=self.axis))
         return T.stack(shifted, axis=self.offset_axis)
     return None
示例#58
0
 def activation(self, network, in_vw):
     """Multiply the input elementwise by itself rolled one step along axis 1.

     ``network`` is not used here.
     """
     values = in_vw.variable
     neighbours = T.roll(values, shift=1, axis=1)
     return values * neighbours