Example
File: nnjm.py  Project: comadan/nn
class NNJM(Net):
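    """Feedforward neural network joint model: the first hidden layer
    conditions on both a target-side input window and source-context
    features (apparently an NNJM in the sense of Devlin et al. 2014,
    judging from the class name and the source-context input)."""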

    def __init__(self, dset, hps, opt_hps, train=True, opt='nag'):

        super(NNJM, self).__init__(dset, hps, train=train)
        self.ctc_loader = CTCLoader(SOURCE_CONTEXT*NUM_CHARS, dset.batch_size, dset.subset)
        self.nl = get_nl(hps.nl)

        self.alloc_params()

        if train:
            self.opt = create_optimizer(opt, self, alpha=opt_hps.alpha,
                    mom=opt_hps.mom, mom_low=opt_hps.mom_low,
                    low_mom_iters=opt_hps.low_mom_iters)

    @staticmethod
    def init_hyperparams():
        return NNJMHyperparams()

    def alloc_params(self):
        hps = self.hps

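        # Naming: Wih maps the one-hot target input into the hidden layer,
        # Wsh maps the source-context features, Who maps the last hidden
        # layer to the output logits; the b* vectors are the biases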
        self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
        self.params['Wsh'] = vp_init((hps.hidden_size, hps.source_size))
        self.params['bih'] = zeros((hps.hidden_size, 1))

        for k in xrange(hps.hidden_layers - 1):
            self.params['W%d' % (k+1)] = vp_init((hps.hidden_size, hps.hidden_size))
            self.params['b%d' % (k+1)] = zeros((hps.hidden_size, 1))

        self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
        self.params['bho'] = zeros((hps.output_size, 1))

        self.count_params()

        # Allocate grads as well

        self.grads = {}
        for k in self.params:
            self.grads[k] = empty(self.params[k].shape)
        logger.info('Allocated gradients')

    def run(self, back=True):
        super(NNJM, self).run(back=back)

        data, labels = self.dset.get_batch()
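        # Expand integer indices to one-hot and (presumably) stack the
        # context window into a single input column per example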
        data = one_hot(data, self.hps.output_size)
        data = data.reshape((-1, data.shape[2]))
        source_data = self.ctc_loader.get_batch()

        #cost, grads = self.cost_and_grad((data, source_data), labels)
        #self.check_grad((data, source_data), labels, grads, params_to_check=['Wsh'], eps=0.01)

        #print labels
        #print np.argmax(source_data, axis=0)

        if back:
            self.update_params((data, source_data), labels)
        else:
            cost, probs = self.cost_and_grad((data, source_data), labels, back=False)
            return cost, probs

    def cost_and_grad(self, data_and_source_data, labels, back=True):
        data, source_data = data_and_source_data
        hps = self.hps
        grads = self.grads

        # May not be full batch size if at end of dataset
        bsize = data.shape[-1]

        p = ParamStruct(**self.params)

        # Forward prop

        acts = list()
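        # First hidden layer mixes the target-side input (Wih) with the
        # source-context features (Wsh) before the nonlinearity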
        acts.append(self.nl(mult(p.Wih, data) + mult(p.Wsh, source_data) + p.bih))
        #acts.append(self.nl(mult(p.Wsh, source_data) + p.bih))

        for k in xrange(hps.hidden_layers - 1):
            W = self.params['W%d' % (k+1)]
            b = self.params['b%d' % (k+1)]
            acts.append(self.nl(mult(W, acts[-1]) + b))

        y = mult(p.Who, acts[-1]) + p.bho
        probs = softmax(y)

        if labels is None:
            return None, probs

        # NOTE: for more precision, convert to an np.float64 array early if necessary
        cost_array = np.empty(bsize, dtype=np.float64)
        # Speed things up by doing assignments off the GPU
        neg_log_prob = -1 * np.log(as_np(probs))
        for k in xrange(bsize):
            cost_array[k] = neg_log_prob[labels[k], k]
        cost = cost_array.sum() / bsize

        if not back:
            return cost, probs

        # Backprop

        for k in self.grads:
            self.grads[k][:] = 0

        # Do assignments off GPU to speed things up
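        # Cross-entropy through softmax: dL/dy = probs - one_hot(labels)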
        dLdy = as_np(probs)
        # NOTE This changes probs
        for k in xrange(bsize):
            dLdy[labels[k], k] -= 1
        dLdy = array(dLdy)

        grads['bho'] = dLdy.sum(axis=1).reshape((-1, 1))
        grads['Who'] = mult(dLdy, acts[-1].T)
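        # Ws[k+1] is the weight matrix sitting above hidden layer k;
        # the leading None is a placeholder so the indices line up below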
        Ws = [None] + [self.params['W%d' % (k+1)] for k in xrange(hps.hidden_layers - 1)] + [p.Who]
        deltas = [dLdy]

        for k in reversed(xrange(hps.hidden_layers - 1)):
            delta = get_nl_grad(self.hps.nl, acts[k+1]) * mult(Ws[k + 2].T, deltas[-1])
            deltas.append(delta)
            grads['b%d' % (k+1)] = delta.sum(axis=1).reshape((-1, 1))
            grads['W%d' % (k+1)] = mult(delta, acts[k].T)

        delta = get_nl_grad(self.hps.nl, acts[0]) * mult(Ws[1].T, deltas[-1])
        grads['bih'] = delta.sum(axis=1).reshape((-1, 1))
        grads['Wih'] = mult(delta, data.T)
        grads['Wsh'] = mult(delta, source_data.T)

        # Normalize
        for k in self.grads:
            self.grads[k] /= bsize

        return cost, self.grads

    def start_next_epoch(self):
        self.dset.restart(shuffle=True)
        self.ctc_loader.restart(shuffle=True)
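
To sanity-check the backprop section above, here is a minimal, self-contained NumPy sketch of the same forward/backward pass for a single hidden layer, with tanh standing in for the unspecified get_nl nonlinearity. Everything below (dimensions, names, the finite-difference check) is illustrative and independent of the project's GPU helpers such as mult and as_np.

import numpy as np

rng = np.random.RandomState(0)

# Tiny illustrative dimensions: input 4, source 3, hidden 5, output 6, batch 7
D, S, H, V, B = 4, 3, 5, 6, 7
Wih = 0.1 * rng.randn(H, D)
Wsh = 0.1 * rng.randn(H, S)
bih = np.zeros((H, 1))
Who = 0.1 * rng.randn(V, H)
bho = np.zeros((V, 1))
data = rng.randn(D, B)
source_data = rng.randn(S, B)
labels = rng.randint(V, size=B)

def softmax(y):
    e = np.exp(y - y.max(axis=0, keepdims=True))
    return e / e.sum(axis=0, keepdims=True)

def cost(Wih):
    # Same forward pass as NNJM.cost_and_grad, one hidden layer, tanh
    acts = np.tanh(Wih.dot(data) + Wsh.dot(source_data) + bih)
    probs = softmax(Who.dot(acts) + bho)
    return -np.log(probs[labels, np.arange(B)]).sum() / B

# Analytic gradients, mirroring the backprop section above
acts = np.tanh(Wih.dot(data) + Wsh.dot(source_data) + bih)
probs = softmax(Who.dot(acts) + bho)
dLdy = probs.copy()
dLdy[labels, np.arange(B)] -= 1            # probs - one_hot(labels)
delta = (1 - acts ** 2) * Who.T.dot(dLdy)  # tanh'(z) = 1 - tanh(z)**2
gWih = delta.dot(data.T) / B

# Central finite difference on a single entry of Wih
i, j, eps = 2, 1, 1e-5
E = np.zeros_like(Wih)
E[i, j] = eps
numeric = (cost(Wih + E) - cost(Wih - E)) / (2 * eps)
print(abs(numeric - gWih[i, j]))  # agreement to roughly 1e-10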