Example #1
    def evaluate(self, test_data):
        test_loss = 0
        self.nnlm.eval()
        with torch.no_grad():
            for ctx, tgt in batch_iter(test_data, self.args, self.wd_vocab):
                pred, _ = self.nnlm(ctx)
                loss = self._calc_loss(pred, tgt)
                test_loss += loss.data.item()
        print('test data loss: %.3f' % (test_loss / len(test_data)))
Example #2
    def train(self, train_pairs, enc_optimizer, dec_optimizer, args, src_vocab,
              tgt_vocab):
        train_loss = 0
        for src_batch, tgt_batch in batch_iter(train_pairs, args, src_vocab,
                                               tgt_vocab):
            loss = 0
            # enc_out: (batch_size, seq_len, hidden_size * nb_directions)
            # enc_hidden: (num_layers * nb_directions, batch_size, hidden_size)
            enc_out, enc_hidden = self.encoder(src_batch.src_idxs,
                                               mask=src_batch.non_pad_mask)

            self.encoder.zero_grad()
            self.decoder.zero_grad()

            dec_hidden = enc_hidden
            dec_input = tgt_batch.src_idxs[0].unsqueeze(1)
            if np.random.uniform(0, 1) <= args.teacher_force:
                # teacher forcing: feed the gold target token as the next decoder input
                for i in range(1, tgt_batch.src_idxs.size(0)):
                    dec_out, dec_hidden = self.decoder(dec_input, dec_hidden,
                                                       enc_out)
                    dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(
                        1).repeat(1, dec_hidden.size(-1))
                    loss += self.calc_loss(dec_out, tgt_batch.src_idxs[i])
                    train_loss += loss.data.item()

                    dec_input = tgt_batch.src_idxs[i].unsqueeze(1)
            else:
                # free running: feed the network's own prediction as the next decoder input
                for i in range(1, tgt_batch.src_idxs.size(0)):
                    dec_out, dec_hidden = self.decoder(dec_input, dec_hidden,
                                                       enc_out)
                    dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(
                        1).repeat(1, dec_hidden.size(-1))
                    loss += self.calc_loss(dec_out, tgt_batch.src_idxs[i])
                    train_loss += loss.data.item()

                    _, top_i = dec_out.data.topk(1)
                    dec_input = top_i  # (batch_size, 1)

            loss.backward()

            nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad,
                                            self.encoder.parameters()),
                                     max_norm=5.0)
            nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad,
                                            self.decoder.parameters()),
                                     max_norm=5.0)

            enc_optimizer.step()
            dec_optimizer.step()

        return train_loss / len(train_pairs)
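
Example #2 mixes teacher forcing (feeding the gold token) with free running (feeding the model's own prediction), chosen by a random draw against `args.teacher_force`. Below is a minimal sketch of that decision in isolation; `choose_next_input` and its arguments are illustrative names, not part of the code above, and the sketch makes the draw per step whereas the example draws once per batch.

    import torch

    def choose_next_input(dec_out, gold_step, teacher_force_ratio=0.5):
        """Pick the decoder input for the next time step.

        dec_out:   (batch_size, vocab_size) scores from the current step
        gold_step: (batch_size,) gold token ids for the current step
        """
        if torch.rand(1).item() <= teacher_force_ratio:
            # teacher forcing: feed the gold token
            return gold_step.unsqueeze(1)      # (batch_size, 1)
        # free running: feed the model's own best prediction
        _, top_i = dec_out.topk(1, dim=1)      # (batch_size, 1)
        return top_i
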
Example #3
    def validate(self, valid_data):
        val_loss = 0
        self.nnlm.eval()
        with torch.no_grad():
            for ctx, tgt in batch_iter(valid_data, self.args, self.wd_vocab):
                pred, _ = self.nnlm(ctx)
                loss = self._calc_loss(pred, tgt)
                val_loss += loss.data.item()

        val_loss /= len(valid_data)

        return val_loss
Example #4
    def train(self, train_data, dev_data):
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      self._model.parameters()),
                               lr=self._args.lr)

        lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                 step_size=5,
                                                 gamma=0.5)

        for ep in range(self._args.epoch):
            self._model.train()
            start = time.time()
            train_loss = 0
            nb_correct, nb_gold, nb_pred = 0, 0, 0

            lr_scheduler.step()
            print(lr_scheduler.get_lr())
            for batch in batch_iter(train_data,
                                    self._args.batch_size,
                                    self._wd_vocab,
                                    self._ch_vocab,
                                    device=self._args.device):
                self._model.zero_grad()
                loss = self._model.calc_loss(batch.wd_src, batch.ch_src,
                                             batch.tgt)
                # with multi-GPU DataParallel, custom methods are not parallelized (they run on the primary GPU) and must be called through .module:
                # loss = self._model.module.calc_loss(batch.wd_src, batch.ch_src, batch.tgt)
                train_loss += loss.data.item()
                loss.backward()
                # gradient clipping to mitigate exploding gradients
                nn.utils.clip_grad_value_(filter(lambda p: p.requires_grad,
                                                 self._model.parameters()),
                                          clip_value=5.0)  # clamp each gradient value to [-5, 5]

                optimizer.step()

                pred = self._model(batch.wd_src, batch.ch_src)
                result = self._calc_acc(pred, batch.tgt, batch.mask)
                nb_correct += result[0]
                nb_gold += result[1]
                nb_pred += result[2]

            train_f1 = self._calc_f1(nb_correct, nb_gold, nb_pred)
            print('[Epoch %d] train_loss: %.3f train_F1: %.3f' %
                  (ep, train_loss, train_f1))

            dev_loss, dev_f1 = self._validate(dev_data)
            end = time.time()
            print('dev_loss: %.3f dev_F1: %.3f' % (dev_loss, dev_f1))
            print('time cost: %.2f s' % (end - start))
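
Example #4 clips with `clip_grad_value_` (an element-wise clamp to [-5, 5]), while most of the other examples use `clip_grad_norm_` (a rescaling of the whole gradient vector). A small illustrative comparison on a dummy parameter:

    import torch
    import torch.nn as nn

    w = nn.Parameter(torch.zeros(3))
    w.grad = torch.tensor([3.0, -4.0, 12.0])

    nn.utils.clip_grad_value_([w], clip_value=5.0)
    print(w.grad)                  # tensor([ 3., -4.,  5.]) - each value clamped independently

    w.grad = torch.tensor([3.0, -4.0, 12.0])
    nn.utils.clip_grad_norm_([w], max_norm=5.0)
    print(w.grad.norm())           # ~5.0 - direction kept, total L2 norm rescaled
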
Example #5
    def train_iter(self, train_data, args, vocab, optimizer):
        self.parser_model.train()

        train_loss = 0
        all_arc_acc, all_rel_acc, all_arcs = 0, 0, 0
        start_time = time.time()
        nb_batch = int(np.ceil(len(train_data) / args.batch_size))
        batch_size = args.batch_size // args.update_steps
        for i, batcher in enumerate(
                batch_iter(train_data, batch_size, vocab, True)):
            batcher = (x.to(args.device) for x in batcher)
            wd_idx, extwd_idx, tag_idx, true_head_idx, true_rel_idx, non_pad_mask, _ = batcher

            pred_arc_score, pred_rel_score = self.parser_model(
                wd_idx, extwd_idx, tag_idx, non_pad_mask)

            loss = self.calc_loss(pred_arc_score, pred_rel_score,
                                  true_head_idx, true_rel_idx, non_pad_mask)
            if args.update_steps > 1:
                loss = loss / args.update_steps
            loss_val = loss.data.item()
            train_loss += loss_val
            loss.backward()

            arc_acc, rel_acc, total_arcs = self.calc_acc(
                pred_arc_score, pred_rel_score, true_head_idx, true_rel_idx,
                non_pad_mask)
            all_arc_acc += arc_acc
            all_rel_acc += rel_acc
            all_arcs += total_arcs

            ARC = all_arc_acc * 100. / all_arcs
            REL = all_rel_acc * 100. / all_arcs
            logger.info('Iter%d ARC: %.3f%%, REL: %.3f%%' % (i + 1, ARC, REL))
            logger.info('time cost: %.2fs, train loss: %.2f' %
                        ((time.time() - start_time), loss_val))

            # gradient accumulation: effectively enlarges the batch size while saving GPU memory
            if (i + 1) % args.update_steps == 0 or (i == nb_batch - 1):
                nn.utils.clip_grad_norm_(filter(
                    lambda p: p.requires_grad, self.parser_model.parameters()),
                                         max_norm=5.)
                optimizer.step()
                self.parser_model.zero_grad()

        train_loss /= len(train_data)
        ARC = all_arc_acc * 100. / all_arcs
        REL = all_rel_acc * 100. / all_arcs

        return train_loss, ARC, REL
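
Example #5 accumulates gradients over `args.update_steps` micro-batches before a single optimizer step, dividing the loss by `update_steps` so the accumulated gradient matches that of one full batch. A stripped-down sketch of the same pattern; `model`, `batches`, `criterion`, and `optimizer` are placeholders, not names from the example.

    import torch.nn as nn

    def train_with_accumulation(model, batches, criterion, optimizer, update_steps=4):
        model.train()
        model.zero_grad()
        for i, (inputs, targets) in enumerate(batches):
            loss = criterion(model(inputs), targets) / update_steps  # scale each micro-batch
            loss.backward()                                          # gradients accumulate in .grad
            if (i + 1) % update_steps == 0 or i == len(batches) - 1:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
                optimizer.step()
                model.zero_grad()
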
Example #6
    def train(self, train_data, dev_data):
        # optimizer: updates the model parameters
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, self._model.parameters()),
            lr=self._args.learning_rate,  # 1e-2
            weight_decay=self._args.weight_decay)  # 0

        # optimizer = optim.SGD(filter(lambda p: p.requires_grad, self._model.parameters()),
        #                       lr=self._args.learning_rate,  # 1e-2
        #                       momentum=0.9,
        #                       weight_decay=self._args.weight_decay,  # 1e-5
        #                       nesterov=True)

        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda ep: 0.95**ep)

        # training loop
        for i in range(self._args.epochs):
            self._model.train()

            start = time.time()
            train_loss = 0

            lr_scheduler.step()
            for batch_data in batch_iter(train_data,
                                         self._args.batch_size,
                                         self._vocab,
                                         device=self._args.device):
                self._model.zero_grad()
                pred, tgt = self._model(batch_data.wd_src)
                loss = self._calc_loss(pred, tgt)
                train_loss += loss.data.item()
                loss.backward()
                # gradient clipping against exploding gradients
                nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad,
                                                (self._model.parameters())),
                                         max_norm=1.0)

                optimizer.step()

            dev_loss = self._validate(dev_data)
            end = time.time()

            print('learning rate:', lr_scheduler.get_lr())
            print('[Epoch %d] train loss: %.3f  dev loss: %.3f' %
                  (i, train_loss, dev_loss))
            print('time cost: %.3f' % (end - start))
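
Examples #4, #6, and #7 call `lr_scheduler.step()` at the top of each epoch; since PyTorch 1.1 the documented order is to step the scheduler after the epoch's `optimizer.step()` calls, otherwise the first scheduled learning-rate value is skipped. A tiny sketch of the `LambdaLR` decay used in Example #6; the dummy parameter is illustrative only.

    import torch
    import torch.optim as optim

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = optim.Adam([param], lr=1e-2)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ep: 0.95 ** ep)

    for ep in range(3):
        # ... run the epoch's batches and optimizer.step() calls here ...
        scheduler.step()                   # step once per epoch, after the updates
        print(scheduler.get_last_lr())     # base lr * 0.95 ** (ep + 1)
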
Example #7
    def train(self, train_data, valid_data):
        if self.args.optm == 'Adam':
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          self.nnlm.parameters()),
                                   lr=self.args.lr)
        else:
            # SGD works better for the n-gram language model!
            optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         self.nnlm.parameters()),
                                  lr=self.args.lr,
                                  momentum=0.9,
                                  weight_decay=self.args.weight_decay,
                                  nesterov=True)

        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda epoch: max(1 - epoch / self.args.epoch, 1e-4))

        # optimizer = nn.DataParallel(optimizer, device_ids=[0, 1])

        for ep in range(self.args.epoch):
            self.nnlm.train()

            start = time.time()
            train_loss = 0
            hidden = None
            lr_scheduler.step()
            for ctx, tgt in batch_iter(train_data,
                                       self.args,
                                       self.wd_vocab,
                                       shuffle=False):
                self.nnlm.zero_grad()
                pred, hidden = self.nnlm(ctx, hidden)
                loss = self._calc_loss(pred, tgt)
                train_loss += loss.data.item()
                # after the first backward pass the computation graph is freed, so backpropagating through it a second time would fail
                loss.backward()
                optimizer.step()
                # optimizer.module.step()  # when the optimizer is wrapped in DataParallel, step must be called on the underlying Adam via .module

            end = time.time()
            print('[Epoch %d] train loss: %.3f' %
                  (ep + 1, train_loss / len(train_data)))
            val_loss = self.validate(valid_data)
            print('dev loss: %.3f' % val_loss)
            print('lr: ', lr_scheduler.get_lr())
            print('time cost: %.2fs' % (end - start))
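
Example #7 carries `hidden` across batches, and the comment about the freed computation graph points at the usual pitfall: backpropagating into a hidden state that is still attached to the previous batch's graph fails once that graph has been freed. A common remedy (not shown above) is to detach the state between batches; a minimal sketch with a hypothetical helper name:

    import torch

    def repackage_hidden(hidden):
        """Detach the hidden state from the previous batch's graph."""
        if hidden is None:
            return None
        if isinstance(hidden, torch.Tensor):
            return hidden.detach()
        return tuple(repackage_hidden(h) for h in hidden)   # e.g. an LSTM (h, c) pair

    # inside the batch loop:  hidden = repackage_hidden(hidden)
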
Example #8
    def _validate(self, dev_data):
        dev_loss = 0
        nb_correct, nb_gold, nb_pred = 0, 0, 0
        self._model.eval()
        with torch.no_grad():
            for batch in batch_iter(dev_data,
                                    self._args.batch_size,
                                    self._wd_vocab,
                                    self._ch_vocab,
                                    device=self._args.device):
                loss = self._model.calc_loss(batch.wd_src, batch.ch_src,
                                             batch.tgt)
                dev_loss += loss.data.item()

                pred = self._model(batch.wd_src, batch.ch_src)
                result = self._calc_acc(pred, batch.tgt, batch.mask)
                nb_correct += result[0]
                nb_gold += result[1]
                nb_pred += result[2]

        return dev_loss, self._calc_f1(nb_correct, nb_gold, nb_pred)
Example #9
    def _validate(self, dev_data):
        self._model.eval()

        dev_loss = 0
        cos_sim_lst = []
        with torch.no_grad():  # no gradients are computed or stored inside this block, which speeds up inference
            for batch_data in batch_iter(dev_data,
                                         self._args.batch_size,
                                         self._vocab,
                                         device=self._args.device):
                pred, tgt = self._model(batch_data.wd_src)
                loss = self._calc_loss(pred, tgt)
                dev_loss += loss.data.item()
                _, pred_enc = self._model.encoder(pred,
                                                  batch_data.non_pad_mask)
                _, tgt_enc = self._model.encoder(tgt, batch_data.non_pad_mask)
                cos_sim_lst.append(
                    self._cosine_sim(pred_enc[0][-1], tgt_enc[0][-1]))
            print('cosine similarity:', sum(cos_sim_lst) / len(cos_sim_lst))

        return dev_loss
Example #10
    def evaluate(self, test_data, args, vocab):
        self.parser_model.eval()

        all_arc_acc, all_rel_acc, all_arcs = 0, 0, 0
        with torch.no_grad():
            for batcher in batch_iter(test_data, args.batch_size, vocab):
                batcher = (x.to(args.device) for x in batcher)
                wd_idx, extwd_idx, tag_idx, true_head_idx, true_rel_idx, non_pad_mask, punc_mask = batcher

                pred_arc_score, pred_rel_score = self.parser_model(
                    wd_idx, extwd_idx, tag_idx, non_pad_mask)

                arc_acc, rel_acc, total_arcs = self.metric_evaluate(
                    pred_arc_score, pred_rel_score, true_head_idx,
                    true_rel_idx, non_pad_mask, punc_mask)
                all_arc_acc += arc_acc
                all_rel_acc += rel_acc
                all_arcs += total_arcs

        uas = all_arc_acc * 100. / all_arcs
        las = all_rel_acc * 100. / all_arcs
        return uas, las
Example #11
    def evaluate(self, test_data):
        test_loss = 0
        nb_correct, nb_gold, nb_pred = 0, 0, 0
        self._model.eval()
        with torch.no_grad():
            for batch in batch_iter(test_data,
                                    self._args.batch_size,
                                    self._wd_vocab,
                                    self._ch_vocab,
                                    device=self._args.device):
                loss = self._model.calc_loss(batch.wd_src, batch.ch_src,
                                             batch.tgt)
                test_loss += loss.data.item()

                pred = self._model(batch.wd_src, batch.ch_src)
                result = self._calc_acc(pred, batch.tgt, batch.mask)
                nb_correct += result[0]
                nb_gold += result[1]
                nb_pred += result[2]

        test_f1 = self._calc_f1(nb_correct, nb_gold, nb_pred)
        print('======== test_loss: %.3f test_F1: %.3f ========' %
              (test_loss, test_f1))
        return test_loss, test_f1
Example #12
    def evaluate(self, test_pairs, args, src_vocab, tgt_vocab):
        self.encoder.eval()
        self.decoder.eval()
        # pred_wds, tgt_wds = [], []
        for src_batch, tgt_batch in batch_iter(test_pairs, args, src_vocab,
                                               tgt_vocab):
            # batch_pred_wds, batch_tgt_wds = [], []
            enc_out, enc_hidden = self.encoder(src_batch.src_idxs,
                                               mask=src_batch.non_pad_mask)

            # running (history) scores of the partial hypotheses
            seq_len, batch_size = tgt_batch.src_idxs.size()
            # (bz, beam_size)
            hist_score = torch.zeros((batch_size, args.beam_size),
                                     device=args.device)
            # (beam_size, bz, vocab_size)
            beam_score = torch.zeros(
                (args.beam_size, batch_size, tgt_vocab.vocab_size),
                device=args.device)
            # (bz, beam_size, max_len)
            best_paths = torch.zeros((MAX_LEN, batch_size, args.beam_size),
                                     device=args.device)

            dec_hidden = enc_hidden
            dec_input = tgt_batch.src_idxs[0].unsqueeze(1)
            for i in range(1, min(MAX_LEN, seq_len)):
                if i == 1:
                    # dec_input: (bz, 1)
                    # dec_out: (bz, vocab_size)
                    dec_out, dec_hidden = self.decoder(dec_input, dec_hidden,
                                                       enc_out)
                    dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(
                        1).repeat(1, dec_hidden.size(-1))
                    # (bz, beam_size)
                    top_prob, top_idxs = dec_out.data.topk(args.beam_size,
                                                           dim=1)
                    hist_score = top_prob
                    best_paths[i] = top_idxs
                    # (bz, beam_size)
                    dec_input = top_idxs
                else:
                    # dec_input: (bz, beam_size) -> (beam_size, bz)
                    dec_input = dec_input.transpose(0, 1)
                    for j in range(args.beam_size):
                        # dec_out: (bz, vocab_size)
                        dec_out, dec_hidden = self.decoder(
                            dec_input[j].unsqueeze(1), dec_hidden, enc_out)
                        dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(
                            1).repeat(1, dec_hidden.size(-1))
                        beam_score[j] = dec_out
                    # (bz, beam_size, 1) -> (bz, beam_size, vocab_size)
                    # clone() makes a real copy so the in-place add below is
                    # legal on the expanded view
                    hist_score = hist_score.unsqueeze(-1).expand(
                        (-1, -1, tgt_vocab.vocab_size)).clone()
                    hist_score += beam_score.transpose(
                        0, 1)  # (bz, beam_size, vocab_size)
                    # (bz, beam_size * vocab_size)
                    hist_score = hist_score.reshape((batch_size, -1))
                    # (bz, beam_size)
                    top_prob, top_idxs = hist_score.topk(args.beam_size, dim=1)
                    hist_score = top_prob
                    top_idxs %= tgt_vocab.vocab_size
                    best_paths[i] = top_idxs
                    dec_input = top_idxs
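
Example #12 runs batched beam search: at every step each hypothesis's running score is added to the log-probabilities of all its continuations, and the top `beam_size` totals survive. A simplified single-sequence sketch of one expansion step; the function name and shapes are illustrative, and unlike the code above it also returns which parent hypothesis each surviving token extends.

    import torch

    def expand_beam(scores, log_probs, beam_size):
        """One beam-search expansion step.

        scores:    (beam_size,) running log-probability of each hypothesis
        log_probs: (beam_size, vocab_size) next-token log-probabilities
        """
        vocab_size = log_probs.size(1)
        total = scores.unsqueeze(1) + log_probs             # (beam_size, vocab_size)
        top_scores, flat_idx = total.view(-1).topk(beam_size)
        beam_idx = torch.div(flat_idx, vocab_size, rounding_mode='floor')  # parent hypothesis
        token_idx = flat_idx % vocab_size                   # chosen token
        return top_scores, beam_idx, token_idx
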