Example #1
File: rnn.py  Project: runngezhang/nn-1
    def alloc_params(self):
        # Refer to Ch. 2 pg. 10 of Sutskever's thesis

        hps = self.hps

        # Initial hidden state
        self.params['h0'] = zeros((hps.hidden_size, hps.hidden_layers))

        # Input to hidden; note that if the first layer is recurrent, bih is redundant
        self.params['Wih'] = vp_init((hps.hidden_size, hps.output_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        # recurrent weight
        # NOTE Initialization important for grad check, don't use vp_init?
        self.params['Whh'] = vp_init((hps.hidden_size, hps.hidden_size))
        self.params['bhh'] = zeros((hps.hidden_size, 1))

        # Weights between hidden layers
        for k in xrange(1, hps.hidden_layers):
            self.params['Wh%d' % k] = vp_init(
                (hps.hidden_size, hps.hidden_size))
            self.params['bh%d' % k] = zeros((hps.hidden_size, 1))

        # Hidden to output
        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        # Keep around last hidden state in case want to resume RNN from there
        self.last_h = None

        self.count_params()
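These examples rely on small array helpers (zeros, empty, vp_init) defined elsewhere in the project. As a rough sketch only, assuming vp_init is a variance-preserving random initializer and that zeros/empty wrap NumPy (the originals may allocate GPU arrays, e.g. via gnumpy), they could look like:

import numpy as np

def zeros(shape):
    # Plain NumPy stand-in; the project may allocate GPU arrays instead.
    return np.zeros(shape)

def empty(shape):
    return np.empty(shape)

def vp_init(shape):
    # Assumed variance-preserving init: scale by 1/sqrt(fan_in) so that
    # activations keep roughly unit variance at initialization.
    return np.random.randn(*shape) / np.sqrt(shape[1])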
Example #2
File: rnn.py  Project: comadan/nn
    def alloc_params(self):
        # Refer to Ch. 2 pg. 10 of Sutskever's thesis

        hps = self.hps

        # Initial hidden state
        self.params['h0'] = zeros((hps.hidden_size, hps.hidden_layers))

        # Input to hidden; note that if the first layer is recurrent, bih is redundant
        self.params['Wih'] = vp_init((hps.hidden_size, hps.output_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        # recurrent weight
        # NOTE Initialization important for grad check, don't use vp_init?
        self.params['Whh'] = vp_init((hps.hidden_size, hps.hidden_size))
        self.params['bhh'] = zeros((hps.hidden_size, 1))

        # Weights between hidden layers
        for k in xrange(1, hps.hidden_layers):
            self.params['Wh%d' % k] = vp_init((hps.hidden_size, hps.hidden_size))
            self.params['bh%d' % k] = zeros((hps.hidden_size, 1))

        # Hidden to output
        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        # Keep around last hidden state in case want to resume RNN from there
        self.last_h = None

        self.count_params()
Example #3
File: dnn.py  Project: comadan/nn
    def alloc_params(self):
        hps = self.hps

        self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        for k in xrange(hps.hidden_layers - 1):
            self.params['W%d' % (k+1)] = vp_init((hps.hidden_size, hps.hidden_size))
            self.params['b%d' % (k+1)] = zeros((hps.hidden_size, 1))

        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        self.count_params()
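The parameter names above (Wih/bih, W1..Wk and b1..bk, Who/bho) map directly onto a feed-forward pass. A generic sketch, not the project's actual forward code, assuming a tanh nonlinearity and column-major (features, batch) inputs:

import numpy as np

def forward(params, x, hidden_layers, nl=np.tanh):
    # x: (input_size, batch), matching the (out, in) weight shapes above
    h = nl(params['Wih'].dot(x) + params['bih'])
    for k in range(1, hidden_layers):
        h = nl(params['W%d' % k].dot(h) + params['b%d' % k])
    return params['Who'].dot(h) + params['bho']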
Example #4
    def alloc_params(self):
        hps = self.hps

        self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        for k in xrange(hps.hidden_layers - 1):
            self.params['W%d' % (k + 1)] = vp_init(
                (hps.hidden_size, hps.hidden_size))
            self.params['b%d' % (k + 1)] = zeros((hps.hidden_size, 1))

        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        self.count_params()
Example #5
File: mom.py  Project: comadan/nn
    def __init__(self, model, alpha=1e-3, mom=0.95, mom_low=0.5, low_mom_iters=100, max_grad=None, rmsprop=False, rmsprop_decay=0.99):
        super(MomentumOptimizer, self).__init__(model, alpha)
        # Momentum coefficient
        self.mom = mom
        self.mom_low = mom_low
        self.low_mom_iters = low_mom_iters
        self.max_grad = max_grad
        self.grad_norm = 0.0

        # Velocities
        self.vel = dict()
        if self.mom > 0:
            for p in self.params:
                self.vel[p] = zeros(self.params[p].shape)
            self.updates = self.vel
        else:
            self.vel = self.updates = dict()

        # Keep track of cost and smoothed cost
        self.costs = list()
        self.expcosts = list()

        self.rmsprop = rmsprop
        self.rmsprop_decay = rmsprop_decay
        if rmsprop:
            # Scale gradients by exponentially weighted average of magnitudes
            self.msgrads = dict()
            for p in self.params:
                self.msgrads[p] = None
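The constructor only allocates the optimizer state (velocities, optional RMSProp accumulators). As an illustrative sketch of how that state is typically used, not the project's actual update code, a classical momentum step over the parameter dictionary would be:

def momentum_step(params, grads, vel, alpha, mom):
    # Hypothetical classical momentum update; params, grads and vel are
    # dictionaries of same-shaped arrays, as allocated in __init__ above.
    for p in params:
        vel[p] = mom * vel[p] - alpha * grads[p]
        params[p] += vel[p]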
Example #6
File: nodes.py  Project: xiamike/nn
    def bprop(self):
        logger.debug("%s backprop" % str(self))
        logger.debug("labels shape: %s" % str(self.labels.shape))
        # NOTE Assumes ObjectiveNode has no successors
        batch_size = self.labels.size
        self.full_grad = zeros(self.pred.out.shape)
        for k in range(self.labels.size):
            self.full_grad[self.labels[k], k] = -1.0 / batch_size * (1 / self.pred.out[self.labels[k], k])
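The loop writes the gradient of the averaged negative log-likelihood into full_grad: for each column k, d(-(1/B) * log p[y_k, k]) / dp[y_k, k] = -1 / (B * p[y_k, k]). A vectorized NumPy equivalent, assuming labels is a 1-D integer array, would be:

import numpy as np

def nll_grad(pred_out, labels):
    # pred_out: (num_classes, batch) predicted probabilities
    # labels:   (batch,) integer class indices
    batch_size = labels.size
    cols = np.arange(batch_size)
    full_grad = np.zeros_like(pred_out)
    full_grad[labels, cols] = -1.0 / (batch_size * pred_out[labels, cols])
    return full_grad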
Example #7
    def bprop(self):
        logger.debug('%s backprop' % str(self))
        logger.debug('labels shape: %s' % str(self.labels.shape))
        # NOTE Assumes ObjectiveNode has no successors
        batch_size = self.labels.size
        self.full_grad = zeros(self.pred.out.shape)
        for k in range(self.labels.size):
            self.full_grad[
                self.labels[k],
                k] = -1.0 / batch_size * (1 / self.pred.out[self.labels[k], k])
Example #8
File: nplm_graph.py  Project: comadan/nn
    def alloc_params(self):
        rand_init = lambda shape: rand(shape, rand_range)
        # PARAM Following Vaswani et al. EMNLP 2013
        bias_init = lambda shape: zeros(shape) - np.log(self.vocab_size)
        # NOTE IndexedParamNode allocates batch of values indexed from C
        self.C = IndexedParamNode('x = C[:, ks]', self.dset, (embed_size, self.vocab_size), init_fn=rand_init)
        self.H = ParamNode('H', (hidden_size, context_size*embed_size), init_fn=rand_init)
        self.d = ParamNode('d', (hidden_size, 1), init_fn=bias_init)
        self.U = ParamNode('U', (self.vocab_size, hidden_size), init_fn=rand_init)
        self.b = ParamNode('b', (self.vocab_size, 1), init_fn=bias_init)
        self.W = ParamNode('W', (self.vocab_size, context_size*embed_size), init_fn=rand_init)
        self.param_nodes = [self.C, self.H, self.d, self.U, self.b, self.W]
        logger.info('Allocated parameters')
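The bias_init lambda starts every output bias at -log(vocab_size), so exp(b) sums to one over the vocabulary, the initial softmax is uniform, and the initial per-word cross-entropy is roughly log(vocab_size). A quick check of that property:

import numpy as np

V = 10000
b = np.zeros(V) - np.log(V)              # mirrors bias_init above
p = np.exp(b) / np.exp(b).sum()          # softmax of the bias alone
assert np.allclose(p, 1.0 / V)           # uniform over the vocabulary
assert np.isclose(-np.log(p[0]), np.log(V))  # initial NLL is about log V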
Example #9
File: nnjm.py  Project: comadan/nn
    def alloc_params(self):
        hps = self.hps

        self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
        self.params['Wsh'] = vp_init((hps.hidden_size, hps.source_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        for k in xrange(hps.hidden_layers - 1):
            self.params['W%d' % (k+1)] = vp_init((hps.hidden_size, hps.hidden_size))
            self.params['b%d' % (k+1)] = zeros((hps.hidden_size, 1))

        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        self.count_params()

        # Allocate grads as well

        self.grads = {}
        for k in self.params:
            self.grads[k] = empty(self.params[k].shape)
        logger.info('Allocated gradients')
Example #10
    def _batch_data(batch):
        images = float_tensor(batch[0].float())
        bsize = len(images)
        return m(
            images=images,
            x1=float_tensor(batch[3].float()),
            x2=float_tensor(batch[4].float()),
            id_labels=init(batch[1]),
            pose_labels=init(batch[2]),
            fake_pose_labels=long_tensor(np.random.randint(args.Np, size=bsize)),
            ones=ones(bsize),
            zeros=zeros(bsize),
            noise=float_tensor(np.random.uniform(-1., 1., (bsize, args.Nz))))
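This helper appears to pack one training batch, with ones(bsize) and zeros(bsize) serving as the usual per-sample real/fake targets for an adversarial loss. Purely as an illustration of how such targets are consumed (not code from the project above), a PyTorch-style discriminator loss would be:

import torch.nn.functional as F

def discriminator_loss(d_real, d_fake, ones, zeros):
    # d_real/d_fake: discriminator logits for real and generated samples
    # ones/zeros:    target vectors of shape (bsize,), as built in _batch_data
    return (F.binary_cross_entropy_with_logits(d_real, ones) +
            F.binary_cross_entropy_with_logits(d_fake, zeros))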
Example #11
    def alloc_params(self):
        rand_init = lambda shape: rand(shape, rand_range)
        # PARAM Following Vaswani et al. EMNLP 2013
        bias_init = lambda shape: zeros(shape) - np.log(self.vocab_size)
        # NOTE IndexedParamNode allocates batch of values indexed from C
        self.C = IndexedParamNode('x = C[:, ks]',
                                  self.dset, (embed_size, self.vocab_size),
                                  init_fn=rand_init)
        self.H = ParamNode('H', (hidden_size, context_size * embed_size),
                           init_fn=rand_init)
        self.d = ParamNode('d', (hidden_size, 1), init_fn=bias_init)
        self.U = ParamNode('U', (self.vocab_size, hidden_size),
                           init_fn=rand_init)
        self.b = ParamNode('b', (self.vocab_size, 1), init_fn=bias_init)
        self.W = ParamNode('W', (self.vocab_size, context_size * embed_size),
                           init_fn=rand_init)
        self.param_nodes = [self.C, self.H, self.d, self.U, self.b, self.W]
        logger.info('Allocated parameters')
Example #12
File: mom.py  Project: zhangjiulong/nn
    def __init__(self,
                 model,
                 alpha=1e-3,
                 mom=0.95,
                 mom_low=0.5,
                 low_mom_iters=100,
                 max_grad=None,
                 rmsprop=False,
                 rmsprop_decay=0.99):
        super(MomentumOptimizer, self).__init__(model, alpha)
        # Momentum coefficient
        self.mom = mom
        self.mom_low = mom_low
        self.low_mom_iters = low_mom_iters
        self.max_grad = max_grad
        self.grad_norm = 0.0

        # Velocities
        self.vel = dict()
        if self.mom > 0:
            for p in self.params:
                self.vel[p] = zeros(self.params[p].shape)
            self.updates = self.vel
        else:
            self.vel = self.updates = dict()

        # Keep track of cost and smoothed cost
        self.costs = list()
        self.expcosts = list()

        self.rmsprop = rmsprop
        self.rmsprop_decay = rmsprop_decay
        if rmsprop:
            # Scale gradients by exponentially weighted average of magnitudes
            self.msgrads = dict()
            for p in self.params:
                self.msgrads[p] = None
Example #13
File: rnn.py  Project: comadan/nn
    def cost_and_grad(self, data, labels, back=True, prev_h0=None):
        hps = self.hps
        T = data.shape[1]
        bsize = data.shape[2]

        # FIXME gnumpy reallocates if try and use same parameters?
        #us = self.us[:, 0:T, 0:bsize]
        #dus = self.dus[:, 0:T, 0:bsize]
        #hs = self.hs[:, 0:T, 0:bsize]
        #dhs = self.dhs[:, 0:T, 0:bsize]
        #probs = self.probs[:, 0:T, 0:bsize]
        #dprobs = self.dprobs[:, 0:T, 0:bsize]
        #costs = self.costs[0:T, 0:bsize]

        us = list()
        dus = list()
        hs = list()
        dhs = list()
        h0 = list()
        for k in xrange(hps.hidden_layers):
            us.append(list())
            dus.append(list())
            hs.append(list())
            dhs.append(list())
            h0.append(empty((hps.hidden_size, bsize)))
            for t in xrange(T):
                us[k].append(zeros((hps.hidden_size, bsize)))
                dus[k].append(zeros((hps.hidden_size, bsize)))
                hs[k].append(zeros((hps.hidden_size, bsize)))
                dhs[k].append(zeros((hps.hidden_size, bsize)))
        probs = list()
        for t in xrange(T):
            probs.append(zeros((hps.output_size, bsize)))
        costs = np.zeros((T, bsize))
        if prev_h0 is not None:
            h0 = prev_h0
        else:
            for k in xrange(hps.hidden_layers):
                h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize)
        bih = self.params['bih']
        Wih = self.params['Wih']
        Whh = self.params['Whh']
        bhh = self.params['bhh']
        Who = self.params['Who']
        bho = self.params['bho']

        # Forward prop

        for t in xrange(T):
            for k in xrange(hps.hidden_layers):
                if t == 0:
                    hprev = h0[k]
                else:
                    hprev = hs[k][t-1]

                if k == 0:
                    us[k][t] = mult(Wih, data[:, t, :]) + bih
                else:
                    us[k][t] = mult(self.params['Wh%d' % k], hs[k-1][t])

                if k == hps.recurrent_layer - 1:
                    us[k][t] += mult(Whh, hprev) + bhh
                    # Clip maximum activation
                    mask = us[k][t] < hps.max_act
                    us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask)
                elif k != 0:
                    us[k][t] += self.params['bh%d' % k]

                hs[k][t] = self.nl(us[k][t])

            probs[t] = softmax(mult(Who, hs[-1][t]) + bho)

        self.last_h = list()
        for k in xrange(hps.hidden_layers):
            self.last_h.append(hs[k][-1])

        if labels is None:
            return None, probs

        probs_neg_log = list()
        dprobs = list()
        for t in xrange(T):
            probs_neg_log.append(as_np(-1 * log(probs[t])))
            dprobs.append(as_np(probs[t].copy()))
        for k in xrange(bsize):
            for t in xrange(len(labels[k])):
                costs[t, k] = probs_neg_log[t][labels[k][t], k]
                dprobs[t][labels[k][t], k] -= 1
        for t in xrange(T):
            dprobs[t] = array(dprobs[t])

        # NOTE Summing costs over time
        # NOTE FIXME Dividing by T to get a better sense of whether the
        # objective is decreasing; remove for grad checking
        cost = costs.sum() / bsize / float(T)
        if not back:
            return cost, probs

        # Backprop

        for k in self.grads:
            self.grads[k][:] = 0

        for t in reversed(xrange(T)):
            self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape((-1, 1)) / bsize
            self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize

            for k in reversed(xrange(hps.hidden_layers)):
                if k == hps.hidden_layers - 1:
                    dhs[k][t] += mult(Who.T, dprobs[t])
                else:
                    dhs[k][t] += mult(self.params['Wh%d' % (k+1)].T, dhs[k+1][t])
                dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t]

                if k > 0:
                    self.grads['Wh%d' % k] += mult(dus[k][t], hs[k-1][t].T) / bsize
                    self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize

                if k == hps.recurrent_layer - 1:
                    if t == 0:
                        hprev = h0[k]
                        self.grads['h0'][:, k] = mult(Whh.T, dus[k][t]).sum(axis=-1) / bsize
                    else:
                        hprev = hs[k][t-1]
                        dhs[k][t-1] = mult(Whh.T, dus[k][t])
                    self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize
                    self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize

            self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize
            self.grads['bih'] += dus[0][t].sum(axis=-1).reshape((-1, 1)) / bsize

        return cost, self.grads
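Within the forward pass above, the two-line "Clip maximum activation" idiom is an elementwise minimum written with a boolean mask (a form that also works for GPU array libraries such as gnumpy). A standalone NumPy illustration:

import numpy as np

us = np.array([[-2.0, 5.0], [80.0, 120.0]])
max_act = 100.0

mask = us < max_act
clipped = us * mask + max_act * (1 - mask)   # equivalent to np.minimum(us, max_act)
assert np.allclose(clipped, np.minimum(us, max_act))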
Example #14
File: rnn.py  Project: runngezhang/nn-1
    def cost_and_grad(self, data, labels, back=True, prev_h0=None):
        hps = self.hps
        T = data.shape[1]
        bsize = data.shape[2]

        # FIXME gnumpy reallocates if try and use same parameters?
        #us = self.us[:, 0:T, 0:bsize]
        #dus = self.dus[:, 0:T, 0:bsize]
        #hs = self.hs[:, 0:T, 0:bsize]
        #dhs = self.dhs[:, 0:T, 0:bsize]
        #probs = self.probs[:, 0:T, 0:bsize]
        #dprobs = self.dprobs[:, 0:T, 0:bsize]
        #costs = self.costs[0:T, 0:bsize]

        us = list()
        dus = list()
        hs = list()
        dhs = list()
        h0 = list()
        for k in xrange(hps.hidden_layers):
            us.append(list())
            dus.append(list())
            hs.append(list())
            dhs.append(list())
            h0.append(empty((hps.hidden_size, bsize)))
            for t in xrange(T):
                us[k].append(zeros((hps.hidden_size, bsize)))
                dus[k].append(zeros((hps.hidden_size, bsize)))
                hs[k].append(zeros((hps.hidden_size, bsize)))
                dhs[k].append(zeros((hps.hidden_size, bsize)))
        probs = list()
        for t in xrange(T):
            probs.append(zeros((hps.output_size, bsize)))
        costs = np.zeros((T, bsize))
        if prev_h0 is not None:
            h0 = prev_h0
        else:
            for k in xrange(hps.hidden_layers):
                h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize)
        bih = self.params['bih']
        Wih = self.params['Wih']
        Whh = self.params['Whh']
        bhh = self.params['bhh']
        Who = self.params['Who']
        bho = self.params['bho']

        # Forward prop

        for t in xrange(T):
            for k in xrange(hps.hidden_layers):
                if t == 0:
                    hprev = h0[k]
                else:
                    hprev = hs[k][t - 1]

                if k == 0:
                    us[k][t] = mult(Wih, data[:, t, :]) + bih
                else:
                    us[k][t] = mult(self.params['Wh%d' % k], hs[k - 1][t])

                if k == hps.recurrent_layer - 1:
                    us[k][t] += mult(Whh, hprev) + bhh
                    # Clip maximum activation
                    mask = us[k][t] < hps.max_act
                    us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask)
                elif k != 0:
                    us[k][t] += self.params['bh%d' % k]

                hs[k][t] = self.nl(us[k][t])

            probs[t] = softmax(mult(Who, hs[-1][t]) + bho)

        self.last_h = list()
        for k in xrange(hps.hidden_layers):
            self.last_h.append(hs[k][-1])

        if labels is None:
            return None, probs

        probs_neg_log = list()
        dprobs = list()
        for t in xrange(T):
            probs_neg_log.append(as_np(-1 * log(probs[t])))
            dprobs.append(as_np(probs[t].copy()))
        for k in xrange(bsize):
            for t in xrange(len(labels[k])):
                costs[t, k] = probs_neg_log[t][labels[k][t], k]
                dprobs[t][labels[k][t], k] -= 1
        for t in xrange(T):
            dprobs[t] = array(dprobs[t])

        # NOTE Summing costs over time
        # NOTE FIXME Dividing by T to get a better sense of whether the
        # objective is decreasing; remove for grad checking
        cost = costs.sum() / bsize / float(T)
        if not back:
            return cost, probs

        # Backprop

        for k in self.grads:
            self.grads[k][:] = 0

        for t in reversed(xrange(T)):
            self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape(
                (-1, 1)) / bsize
            self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize

            for k in reversed(xrange(hps.hidden_layers)):
                if k == hps.hidden_layers - 1:
                    dhs[k][t] += mult(Who.T, dprobs[t])
                else:
                    dhs[k][t] += mult(self.params['Wh%d' % (k + 1)].T,
                                      dhs[k + 1][t])
                dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t]

                if k > 0:
                    self.grads['Wh%d' %
                               k] += mult(dus[k][t], hs[k - 1][t].T) / bsize
                    self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape(
                        (-1, 1)) / bsize

                if k == hps.recurrent_layer - 1:
                    if t == 0:
                        hprev = h0[k]
                        self.grads['h0'][:, k] = mult(
                            Whh.T, dus[k][t]).sum(axis=-1) / bsize
                    else:
                        hprev = hs[k][t - 1]
                        dhs[k][t - 1] = mult(Whh.T, dus[k][t])
                    self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize
                    self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape(
                        (-1, 1)) / bsize

            self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize
            self.grads['bih'] += dus[0][t].sum(axis=-1).reshape(
                (-1, 1)) / bsize

        return cost, self.grads
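Several comments above refer to gradient checking (e.g. removing the division by T first). A finite-difference check sketch against the cost_and_grad interface shown here, assuming the parameters are NumPy arrays, could look like:

import numpy as np

def grad_check(model, data, labels, eps=1e-5, samples_per_param=5):
    # Compare a few analytic gradient entries against central differences.
    _, grads = model.cost_and_grad(data, labels)
    for name, param in model.params.items():
        for _ in range(samples_per_param):
            idx = tuple(np.random.randint(s) for s in param.shape)
            orig = param[idx]
            param[idx] = orig + eps
            cost_plus, _ = model.cost_and_grad(data, labels, back=False)
            param[idx] = orig - eps
            cost_minus, _ = model.cost_and_grad(data, labels, back=False)
            param[idx] = orig
            numeric = (cost_plus - cost_minus) / (2 * eps)
            print(name, idx, float(grads[name][idx]), numeric)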