Example #1
    def evaluate_perplexity(self, data, nll_func):
        # Accumulate per-token negative log-likelihoods and their masks over
        # all batches, then exponentiate the masked average NLL to get
        # the corpus perplexity.
        nll_preds = []
        nll_masks = []
        for idbs, idts in data:
            nll, mask = nll_func(idbs, idts)
            assert nll.shape == mask.shape
            nll_preds.append(nll)
            nll_masks.append(mask)
        avg_nll = evaluate_average(predictions=nll_preds, masks=nll_masks)
        return np.exp(avg_nll)
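The example above delegates to an evaluate_average helper that is not shown on this page. A minimal sketch of what it plausibly computes, assuming predictions (and optionally masks) are lists of same-shaped numpy arrays and that masks=None means every token counts:

    import numpy as np

    def evaluate_average(predictions, masks=None):
        # Hypothetical reconstruction: a mask-weighted mean over all
        # per-token NLL values, so np.exp of the result is perplexity.
        if masks is None:
            total = sum(float(np.sum(p)) for p in predictions)
            count = sum(p.size for p in predictions)
        else:
            total = sum(float(np.sum(p * m)) for p, m in zip(predictions, masks))
            count = sum(float(np.sum(m)) for m in masks)
        return total / count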
Example #2
    def train(self, args, train, dev, test=None):
        embedding_layer = self.layers[0]

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        batch_size = args["batch_size"]
        unroll_size = args["unroll_size"]

        train = create_batches(train, embedding_layer.map_to_ids, batch_size)

        dev = create_batches(dev, embedding_layer.map_to_ids, batch_size)

        if test is not None:
            test = create_batches(test, embedding_layer.map_to_ids, batch_size)

        # total NLL summed over tokens, averaged over sequences in the batch
        cost = T.sum(self.nll) / self.idxs.shape[1]
        updates, lr, gnorm = create_optimization_updates(
            cost=cost,
            params=self.params,
            lr=args["learning_rate"],
            beta1=args["beta1"],
            beta2=args["beta2"],
            rho=args["rho"],
            momentum=args["momentum"],
            gamma=args["gamma"],
            eps=args["eps"],
            method=args["learning"])[:3]
        #if args["learning"] == "adadelta":
        #    lr.set_value(args["learning_rate"])

        train_func = theano.function(
            inputs=[self.idxs, self.idys, self.init_state],
            outputs=[cost, self.last_state, gnorm],
            updates=updates)
        eval_func = theano.function(
            inputs=[self.idxs, self.idys, self.init_state],
            outputs=[self.nll, self.last_state])

        N = (len(train[0]) - 1) // unroll_size + 1  # number of unroll-size chunks
        say(" train: {} tokens, {} mini-batches\n".format(
            len(train[0].ravel()), N))
        say(" dev: {} tokens\n".format(len(dev[0].ravel())))

        say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        decay_lr = args["decay_lr"] and args["learning"].lower() != "adadelta" and \
                    args["learning"].lower() != "adagrad"
        lr_0 = args["learning_rate"]
        iter_cnt = 0

        unchanged = 0
        best_dev = 1e+10
        start_time = 0
        max_epoch = args["max_epoch"]
        for epoch in xrange(max_epoch):
            if unchanged > 5: break
            start_time = time.time()

            prev_state = np.zeros((batch_size, self.n_d * 2),
                                  dtype=theano.config.floatX)

            train_loss = 0.0
            for i in xrange(N):
                # get current batch
                x = train[0][i * unroll_size:(i + 1) * unroll_size]
                y = train[1][i * unroll_size:(i + 1) * unroll_size]

                iter_cnt += 1
                if decay_lr:
                    # inverse square-root learning-rate decay
                    lr.set_value(np.float32(lr_0 / iter_cnt**0.5))
                cur_loss, prev_state, grad_norm = train_func(x, y, prev_state)
                train_loss += cur_loss / len(x)

                if math.isnan(cur_loss) or math.isnan(grad_norm):
                    say("\nNaN !!\n")
                    return

                if i % 10 == 0:
                    say("\r{}".format(i))

                if i == N - 1:
                    # end of epoch: evaluate on the dev set with dropout off
                    self.dropout.set_value(0.0)
                    dev_preds = self.evaluate(eval_func, dev, batch_size,
                                              unroll_size)
                    dev_loss = evaluate_average(predictions=dev_preds,
                                                masks=None)
                    dev_ppl = np.exp(dev_loss)
                    self.dropout.set_value(dropout_prob)

                    say("\r\n")
                    say( ( "Epoch={}  lr={:.3f}  train_loss={:.3f}  train_ppl={:.1f}  " \
                        +"dev_loss={:.3f}  dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n" ).format(
                            epoch,
                            float(lr.get_value(borrow=True)),
                            train_loss/N,
                            np.exp(train_loss/N),
                            dev_loss,
                            dev_ppl,
                            float(grad_norm),
                            (time.time()-start_time)/60.0
                        ))
                    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                    # halve the learning rate
                    #if args["learning"] == "sgd" and dev_ppl > best_dev-1:
                    #    lr.set_value(np.max([lr.get_value()/2.0, np.float32(0.0001)]))

                    if dev_ppl < best_dev:
                        best_dev = dev_ppl
                        if test is None: continue
                        self.dropout.set_value(0.0)
                        test_preds = self.evaluate(eval_func, test, batch_size,
                                                   unroll_size)
                        test_loss = evaluate_average(predictions=test_preds,
                                                     masks=None)
                        test_ppl = np.exp(test_loss)
                        self.dropout.set_value(dropout_prob)
                        say("\tbest_dev={:.1f}  test_loss={:.3f}  test_ppl={:.1f}\n"
                            .format(best_dev, test_loss, test_ppl))
                    if best_dev > 200: unchanged += 1  # early-stopping counter

        say("\n")
Example #3
    def train(self, args, train, dev, test=None):
        embedding_layer = self.layers[-2]

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(
            theano.config.floatX)
        batch_size = args["batch_size"]
        unroll_size = args["unroll_size"]

        train = create_batches(train, embedding_layer.map_to_ids, batch_size)

        dev = create_batches(dev, embedding_layer.map_to_ids, 1)

        if test is not None:
            test = create_batches(test, embedding_layer.map_to_ids, 1)

        cost = T.sum(self.nll) / self.idxs.shape[1]
        updates, lr, gnorm = create_optimization_updates(
            cost=cost,
            params=self.params,
            lr=args["learning_rate"],
            eps=args["eps"],
            method=args["learning"])[:3]

        train_func = theano.function(inputs=[self.idxs, self.idys] +
                                     self.init_state,
                                     outputs=[cost, gnorm] + self.last_state,
                                     updates=updates)
        eval_func = theano.function(
            inputs=[self.idxs, self.idys] + self.init_state,
            outputs=[self.nll] + self.last_state,
        )

        N = (len(train[0]) - 1) // unroll_size + 1  # number of unroll-size chunks
        say(" train: {} tokens, {} mini-batches\n".format(
            len(train[0].ravel()), N))
        say(" dev: {} tokens\n".format(len(dev[0].ravel())))

        say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        decay_epoch = args["lr_decay_epoch"]
        decay_rate = args["lr_decay"]
        lr_0 = args["learning_rate"]
        iter_cnt = 0

        depth = args["depth"]
        unchanged = 0
        best_dev = 1e+10
        start_time = 0
        max_epoch = args["max_epoch"]
        for epoch in xrange(max_epoch):
            unchanged += 1
            if unchanged > 20: break

            if decay_epoch > 0 and epoch >= decay_epoch:
                # multiplicative learning-rate decay after decay_epoch
                lr.set_value(np.float32(lr.get_value() * decay_rate))

            start_time = time.time()

            # initial recurrent state: two zero vectors per layer
            prev_state = [
                np.zeros((batch_size, self.n_d), dtype=theano.config.floatX)
                for i in xrange(depth * 2)
            ]

            train_loss = 0.0
            for i in xrange(N):
                # get current batch
                x = train[0][i * unroll_size:(i + 1) * unroll_size]
                y = train[1][i * unroll_size:(i + 1) * unroll_size]

                iter_cnt += 1
                ret = train_func(x, y, *prev_state)
                cur_loss, grad_norm, prev_state = ret[0], ret[1], ret[2:]
                train_loss += cur_loss / len(x)

                if i % 10 == 0:
                    say("\r{}".format(i))

                if i == N - 1:
                    # end of epoch: evaluate on the dev set with dropout off
                    self.dropout.set_value(0.0)
                    self.rnn_dropout.set_value(0.0)
                    dev_preds = self.evaluate(eval_func, dev, 1, unroll_size)
                    dev_loss = evaluate_average(predictions=dev_preds,
                                                masks=None)
                    dev_ppl = np.exp(dev_loss)
                    self.dropout.set_value(dropout_prob)
                    self.rnn_dropout.set_value(rnn_dropout_prob)

                    say("\r\n")
                    say( ( "Epoch={}  lr={:.4f}  train_loss={:.3f}  train_ppl={:.1f}  " \
                        +"dev_loss={:.3f}  dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n" ).format(
                            epoch,
                            float(lr.get_value(borrow=True)),
                            train_loss/N,
                            np.exp(train_loss/N),
                            dev_loss,
                            dev_ppl,
                            float(grad_norm),
                            (time.time()-start_time)/60.0
                        ))
                    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                    if dev_ppl < best_dev:
                        best_dev = dev_ppl
                        if test is None: continue
                        self.dropout.set_value(0.0)
                        self.rnn_dropout.set_value(0.0)
                        test_preds = self.evaluate(eval_func, test, 1,
                                                   unroll_size)
                        test_loss = evaluate_average(predictions=test_preds,
                                                     masks=None)
                        test_ppl = np.exp(test_loss)
                        self.dropout.set_value(dropout_prob)
                        self.rnn_dropout.set_value(rnn_dropout_prob)
                        say("\tbest_dev={:.1f}  test_loss={:.3f}  test_ppl={:.1f}\n"
                            .format(best_dev, test_loss, test_ppl))
                        if best_dev < 200: unchanged = 0

        say("\n")