Example #1
    def setup_model(self):
        self.train_loss = AverageMeter()
        self.network = ConvQA_CN_Net(self.opt, self.dev_lang)
        if self.use_cuda:
            self.log('Using GPU to setup model...')
            self.network.cuda()
        parameters = [p for p in self.network.parameters() if p.requires_grad]

        ## Ranger optimizer (RAdam + Lookahead)
        self.optimizer = Ranger(parameters)
        # self.optimizer = AdamW(parameters, lr=3e-5, weight_decay=0.01)
        self.updates = 0
        self.epoch_start = 0
Example #2
    def setup_model(self, vocab_embedding):
        self.train_loss = AverageMeter()
        self.network = SDNet(self.opt, vocab_embedding)
        if self.use_cuda:
            self.log('Putting model into GPU')
            self.network.cuda()

        parameters = [p for p in self.network.parameters() if p.requires_grad]
        self.optimizer = optim.Adamax(parameters)
        if 'ADAM2' in self.opt:
            print('ADAM2')
            self.optimizer = optim.Adam(parameters, lr=0.0001)

        self.updates = 0
        self.epoch_start = 0
        self.loss_func = F.cross_entropy
Example #3
class SDNetTrainer(BaseTrainer):
    def __init__(self, opt):
        super(SDNetTrainer, self).__init__(opt)
        print('SDNet Model Trainer')
        set_dropout_prob(
            0.0 if 'DROPOUT' not in opt else float(opt['DROPOUT']))
        self.seed = int(opt['SEED'])
        self.data_prefix = 'coqa-'
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.preproc = CoQAPreprocess(self.opt)
        if self.use_cuda:
            torch.cuda.manual_seed_all(self.seed)

    def official(self, model_path, test_data):
        print('-----------------------------------------------')
        print("Initializing model...")
        self.setup_model(self.preproc.train_embedding)
        self.load_model(model_path)

        print("Predicting in batches...")
        test_batches = BatchGen(self.opt,
                                test_data['data'],
                                self.use_cuda,
                                self.preproc.train_vocab,
                                self.preproc.train_char_vocab,
                                evaluation=True)
        predictions = []
        confidence = []
        final_json = []
        cnt = 0
        for j, test_batch in enumerate(test_batches):
            cnt += 1
            if cnt % 50 == 0:
                print(cnt, '/', len(test_batches))
            phrase, phrase_score, pred_json = self.predict(test_batch)
            predictions.extend(phrase)
            confidence.extend(phrase_score)
            final_json.extend(pred_json)

        return predictions, confidence, final_json

    def train(self):
        self.isTrain = True
        self.getSaveFolder()
        self.saveConf()
        self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data()
        self.log('-----------------------------------------------')
        self.log("Initializing model...")
        self.setup_model(vocab_embedding)

        if 'RESUME' in self.opt:
            model_path = os.path.join(self.opt['datadir'],
                                      self.opt['MODEL_PATH'])
            self.load_model(model_path)

        print('Loading train json...')
        with open(
                os.path.join(self.opt['FEATURE_FOLDER'],
                             self.data_prefix + 'train-preprocessed.json'),
                'r') as f:
            train_data = json.load(f)

        print('Loading dev json...')
        with open(
                os.path.join(self.opt['FEATURE_FOLDER'],
                             self.data_prefix + 'dev-preprocessed.json'),
                'r') as f:
            dev_data = json.load(f)

        best_f1_score = 0.0
        numEpochs = self.opt['EPOCH']
        for epoch in range(self.epoch_start, numEpochs):
            self.log('Epoch {}'.format(epoch))
            self.network.train()
            startTime = datetime.now()
            train_batches = BatchGen(self.opt, train_data['data'],
                                     self.use_cuda, self.vocab,
                                     self.char_vocab)
            dev_batches = BatchGen(self.opt,
                                   dev_data['data'],
                                   self.use_cuda,
                                   self.vocab,
                                   self.char_vocab,
                                   evaluation=True)
            for i, batch in enumerate(train_batches):
                if i == len(train_batches) - 1 or (
                        epoch == 0 and i == 0 and
                    ('RESUME' in self.opt)) or (i > 0 and i % 1500 == 0):
                    print('Saving folder is', self.saveFolder)
                    print('Evaluating on dev set...')
                    predictions = []
                    confidence = []
                    dev_answer = []
                    final_json = []
                    for j, dev_batch in enumerate(dev_batches):
                        phrase, phrase_score, pred_json = self.predict(
                            dev_batch)
                        final_json.extend(pred_json)
                        predictions.extend(phrase)
                        confidence.extend(phrase_score)
                        dev_answer.extend(dev_batch[-3])  # answer_str
                    result, all_f1s = score(predictions, dev_answer,
                                            final_json)
                    f1 = result['f1']

                    if f1 > best_f1_score:
                        model_file = os.path.join(self.saveFolder,
                                                  'best_model.pt')
                        self.save_for_predict(model_file, epoch)
                        best_f1_score = f1
                        pred_json_file = os.path.join(self.saveFolder,
                                                      'prediction.json')
                        with open(pred_json_file, 'w') as output_file:
                            json.dump(final_json, output_file)
                        score_per_instance = []
                        for instance, s in zip(final_json, all_f1s):
                            score_per_instance.append({
                                'id': instance['id'],
                                'turn_id': instance['turn_id'],
                                'f1': s
                            })
                        score_per_instance_json_file = os.path.join(
                            self.saveFolder, 'score_per_instance.json')
                        with open(score_per_instance_json_file,
                                  'w') as output_file:
                            json.dump(score_per_instance, output_file)

                    self.log("Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})".
                             format(epoch, f1, best_f1_score))
                    self.log("Results breakdown\n{0}".format(result))

                self.update(batch)
                if i % 100 == 0:
                    self.log(
                        'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.
                        format(
                            self.updates, self.train_loss.avg,
                            str((datetime.now() - startTime) / (i + 1) *
                                (len(train_batches) - i - 1)).split('.')[0]))

            print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / numEpochs))
            print('Config file is at ' + self.opt['confFile'])

    def setup_model(self, vocab_embedding):
        self.train_loss = AverageMeter()
        self.network = SDNet(self.opt, vocab_embedding)
        if self.use_cuda:
            self.log('Putting model into GPU')
            self.network.cuda()

        parameters = [p for p in self.network.parameters() if p.requires_grad]
        self.optimizer = optim.Adamax(parameters)
        if 'ADAM2' in self.opt:
            print('ADAM2')
            self.optimizer = optim.Adam(parameters, lr=0.0001)

        self.updates = 0
        self.epoch_start = 0
        self.loss_func = F.cross_entropy

    def update(self, batch):
        # Train mode
        self.network.train()
        self.network.drop_emb = True

        x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \
        query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, _, _, _, _ = batch

        # Run forward
        # score_s, score_e: batch x context_word_num
        # score_yes, score_no, score_no_answer: batch x 1
        score_s, score_e, score_yes, score_no, score_no_answer = self.network(
            x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert,
            x_bert_mask, x_bert_offsets, query, query_mask, query_char,
            query_char_mask, query_bert, query_bert_mask, query_bert_offsets,
            len(context_words))
        max_len = self.opt['max_len'] or score_s.size(1)
        batch_size = score_s.shape[0]
        context_len = score_s.size(1)
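        # gen_upper_triangle (a helper from this repo) flattens score_s/score_e into a
        # batch x (context_len * context_len) matrix of span scores; judging by its
        # arguments, spans longer than max_len are masked out.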
        expand_score = gen_upper_triangle(score_s, score_e, max_len,
                                          self.use_cuda)
        scores = torch.cat(
            (expand_score, score_no, score_yes, score_no_answer),
            dim=1)  # batch x (context_len * context_len + 3)
        targets = []
        span_idx = int(context_len * context_len)
        for i in range(ground_truth.shape[0]):
            if ground_truth[i][0] == -1 and ground_truth[i][1] == -1:  # no answer
                targets.append(span_idx + 2)
            elif ground_truth[i][0] == 0 and ground_truth[i][1] == -1:  # no
                targets.append(span_idx)
            elif ground_truth[i][0] == -1 and ground_truth[i][1] == 0:  # yes
                targets.append(span_idx + 1)
            elif ground_truth[i][0] != -1 and ground_truth[i][1] != -1:  # normal span
                targets.append(ground_truth[i][0] * context_len +
                               ground_truth[i][1])

        targets = torch.LongTensor(np.array(targets, dtype=int))
        # targets = torch.from_numpy(np.array(targets))
        if self.use_cuda:
            targets = targets.cuda()
        loss = self.loss_func(scores, targets)
        self.train_loss.update(loss.item(), 1)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                       self.opt['grad_clipping'])
        self.optimizer.step()
        self.updates += 1
        if 'TUNE_PARTIAL' in self.opt:
            self.network.vocab_embed.weight.data[
                self.opt['tune_partial']:] = self.network.fixed_embedding

    def predict(self, batch):
        self.network.eval()
        self.network.drop_emb = False

        # Run forward
        x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \
        query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, \
        context_word_offsets, answers, context_id, turn_ids = batch

        context_len = len(context_words)
        score_s, score_e, score_yes, score_no, score_no_answer = self.network(
            x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert,
            x_bert_mask, x_bert_offsets, query, query_mask, query_char,
            query_char_mask, query_bert, query_bert_mask, query_bert_offsets,
            len(context_words))
        batch_size = score_s.shape[0]
        max_len = self.opt['max_len'] or score_s.size(1)

        expand_score = gen_upper_triangle(score_s, score_e, max_len,
                                          self.use_cuda)
        scores = torch.cat(
            (expand_score, score_no, score_yes, score_no_answer),
            dim=1)  # batch x (context_len * context_len + 3)
        prob = F.softmax(
            scores,
            dim=1).data.cpu()  # Transfer to CPU/normal tensors for numpy ops

        # Get argmax text spans
        predictions = []
        confidence = []

        pred_json = []
        for i in range(batch_size):
            _, ids = torch.sort(prob[i, :], descending=True)
            idx = 0
            best_id = ids[idx]

            confidence.append(float(prob[i, best_id]))
            if best_id < context_len * context_len:
                st = best_id // context_len
                ed = best_id % context_len
                st = context_word_offsets[st][0]
                ed = context_word_offsets[ed][1]
                predictions.append(context_str[st:ed])

            if best_id == context_len * context_len:
                predictions.append('no')

            if best_id == context_len * context_len + 1:
                predictions.append('yes')

            if best_id == context_len * context_len + 2:
                predictions.append('unknown')

            pred_json.append({
                'id': context_id,
                'turn_id': turn_ids[i],
                'answer': predictions[-1]
            })

        return predictions, confidence, pred_json  # list of strings, list of floats, list of jsons

    def load_model(self, model_path):
        print('Loading model from', model_path)
        checkpoint = torch.load(model_path)
        state_dict = checkpoint['state_dict']
        new_state = set(self.network.state_dict().keys())
        for k in list(state_dict['network'].keys()):
            if k not in new_state:
                del state_dict['network'][k]
        for k, v in list(self.network.state_dict().items()):
            if k not in state_dict['network']:
                state_dict['network'][k] = v
        self.network.load_state_dict(state_dict['network'])

        print('Loading finished', model_path)

    def save(self, filename, epoch, prev_filename):
        params = {
            'state_dict': {
                'network': self.network.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'updates': self.updates  # how many updates
            },
            'train_loss': {
                'val': self.train_loss.val,
                'avg': self.train_loss.avg,
                'sum': self.train_loss.sum,
                'count': self.train_loss.count
            },
            'config': self.opt,
            'epoch': epoch
        }
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
            if os.path.exists(prev_filename):
                os.remove(prev_filename)
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')

    def save_for_predict(self, filename, epoch):
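        # Keep only SDNet's own weights: frozen pretrained encoder parameters
        # (CoVe / ELMo / AllenELMo / Bert key prefixes) are filtered out so the
        # prediction checkpoint stays small.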
        network_state = dict([(k, v)
                              for k, v in self.network.state_dict().items()
                              if k[0:4] != 'CoVe' and k[0:4] != 'ELMo'
                              and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert'])

        if 'eval_embed.weight' in network_state:
            del network_state['eval_embed.weight']
        if 'fixed_embedding' in network_state:
            del network_state['fixed_embedding']
        params = {
            'state_dict': {
                'network': network_state
            },
            'config': self.opt,
        }
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')
Example #4
class SDNetTrainer(BaseTrainer):
    def __init__(self, opt):
        super(SDNetTrainer, self).__init__(opt)
        print('SDNet Model Trainer')
        set_dropout_prob(
            0.0 if 'DROPOUT' not in opt else float(opt['DROPOUT']))
        self.seed = int(opt['SEED'])
        self.data_prefix = 'coqa-'
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.preproc = CoQAPreprocess(self.opt)
        if self.use_cuda:
            torch.cuda.manual_seed_all(self.seed)

    def official(self, model_path, test_data):
        print('-----------------------------------------------')
        print('Initializing model...')
        self.setup_model(self.preproc.train_embedding)
        self.load_model(model_path)

        print('Predicting in batches...')
        test_batches = BatchGen(self.opt,
                                test_data['data'],
                                self.use_cuda,
                                self.preproc.train_vocab,
                                self.preproc.train_char_vocab,
                                evaluation=True)
        predictions = []
        confidence = []
        final_json = []
        cnt = 0
        for j, test_batch in enumerate(test_batches):
            cnt += 1
            if cnt % 50 == 0:
                print(cnt, '/', len(test_batches))
            phrase, phrase_score, pred_json = self.predict(test_batch)
            predictions.extend(phrase)  # append the new items to the existing list
            confidence.extend(phrase_score)
            final_json.extend(pred_json)
        return predictions, confidence, final_json

    def setup_model(self, vocab_embedding):  # initialize the model
        self.train_loss = AverageMeter()
        self.network = SDNet(self.opt, vocab_embedding)
        if self.use_cuda:
            self.log('Putting model into GPU')
            self.network.cuda()

        parameters = [p for p in self.network.parameters() if p.requires_grad]
        self.optimizer = optim.Adamax(params=parameters)
        if 'ADAM2' in self.opt:
            print('ADAM2')
            self.optimizer = optim.Adam(params=parameters, lr=0.0001)

        self.updates = 0
        self.epoch_start = 0
        self.loss_func = F.cross_entropy

    def load_model(self, model_path):
        print('Loading model from', model_path)
        checkpoint = torch.load(model_path)
        state_dict = checkpoint['state_dict']
        new_state = set(self.network.state_dict().keys())
        for k in list(state_dict['network'].keys()):
            if k not in new_state:
                del state_dict['network'][k]
        for k, v in list(self.network.state_dict().items()):
            if k not in state_dict['network']:
                state_dict['network'][k] = v
        self.network.load_state_dict(state_dict['network'])
        print('Loading finished', model_path)

    def save(self, filename, epoch, prev_filename):
        params = {
            'state_dict': {
                'network': self.network.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'updates': self.updates
            },
            'train_loss': {
                'val': self.train_loss.val,
                'avg': self.train_loss.avg,
                'sum': self.train_loss.sum,
                'count': self.train_loss.count
            },
            'config': self.opt,
            'epoch': epoch
        }
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
            if os.path.exists(prev_filename):
                os.remove(prev_filename)
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')

    def save_for_predict(self, filename, epoch):
        network_state = dict([(k, v)
                              for k, v in self.network.state_dict().items()
                              if k[0:4] != 'CoVe' and k[0:4] != 'ELMo'
                              and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert'])
        if 'eval_embed.weight' in network_state:
            del network_state['eval_embed.weight']
        if 'fixed_embedding' in network_state:
            del network_state['fixed_embedding']
        params = {'state_dict': {'network': network_state}, 'config': self.opt}
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')

    def update(self, batch):
        """
            训练函数train()最后调用了前向计算函数update()。该函数根据批次数据batch中的内容直接与SDNet网络代码对接,
            进行一次前向计算,然后计算交叉熵损失函数,利用PyTorch自带的反向传播函数backward求导并更新参数。由于
            CoQA任务的答案可能是文章中的一段区间,也有可能是“是/否/没有答案”,因此update()对所有概率进行统一处理:
            如果文章中有m个单词,update()根据网络输出层结果生成一个长度为m^2+3的向量SCOYes,表示答案是各种可能的
            文章区间与3种特殊情况的概率。
        """
        self.network.train()  # enter training mode
        self.network.drop_emb = True
        # Unpack everything about the passage, question and answer from the batch,
        # including word ids, POS tags, BERT subword ids, etc.
        x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, \
        query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, \
        ground_truth, context_str, context_words, _, _, _, _ = batch
        # Forward pass to obtain the model's predicted answer scores:
        # 1) score_s, score_e: probability of the answer starting / ending at each passage position
        # 2) score_yes, score_no, score_no_answer: probability of Yes / No / No answer
        # score_s and score_e have shape batch * context_word_num; score_yes, score_no, score_no_answer are batch * 1
        score_s, score_e, score_yes, score_no, score_no_answer = self.network(
            x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert,
            x_bert_mask, x_bert_offsets, query, query_mask, query_char,
            query_char_mask, query_bert, query_bert_mask, query_bert_offsets,
            len(context_words))
        # The maximum answer length is defined in the config file
        max_len = self.opt['max_len'] or score_s.size(1)
        batch_size = score_s.shape[0]
        context_len = score_s.size(1)
        expand_score = gen_upper_triangle(score_s, score_e, max_len,
                                          self.use_cuda)  # span-answer scores
        # Concatenate the span scores with the "no" / "yes" / "no answer" scores
        scores = torch.cat(
            (expand_score, score_no, score_yes, score_no_answer),
            dim=1)  # batch * (context_len * context_len + 3)
        # Convert the gold answer position into a 1-D index aligned with expand_score,
        # e.g. span [3, 5] becomes 3 * m + 5, "no" becomes m * m, "yes" becomes m * m + 1,
        # and "no answer" becomes m * m + 2.
        targets = []
        span_idx = int(context_len * context_len)
        for i in range(ground_truth.shape[0]):
            if ground_truth[i][0] == -1 and ground_truth[i][1] == -1:  # no answer
                targets.append(span_idx + 2)
            elif ground_truth[i][0] == 0 and ground_truth[i][1] == -1:  # no
                targets.append(span_idx)
            elif ground_truth[i][0] == -1 and ground_truth[i][1] == 0:  # yes
                targets.append(span_idx + 1)
            elif ground_truth[i][0] != -1 and ground_truth[i][1] != -1:  # span
                targets.append(ground_truth[i][0] * context_len +
                               ground_truth[i][1])

        targets = torch.LongTensor(np.array(targets))
        if self.use_cuda:
            targets = targets.cuda()
        loss = self.loss_func(input=scores, target=targets)  # cross-entropy loss
        self.train_loss.update(loss.item(), 1)
        self.optimizer.zero_grad()  # clear all accumulated gradients
        loss.backward()  # backpropagate with PyTorch autograd
        torch.nn.utils.clip_grad_norm_(parameters=self.network.parameters(),
                                       max_norm=self.opt['grad_clipping'])
        self.optimizer.step()  # update the parameters
        self.updates += 1
        if 'TUNE_PARTIAL' in self.opt:
            self.network.vocab_embed.weight.data[
                self.opt['tune_partial']:] = self.network.fixed_embedding

    def predict(self, batch):
        """
            SDNet每更新1500个batch就会利用测试函数predict()在验证集上预测答案并计算准确率得分。predict()函数
            的流程与update()函数类似,也要进行一次前向计算得到网络输出结果。之后,模型在所有可能的答案中选择概率
            最大的作为预测结果。最终输出包括预测答案和对应的概率,并按照CoQA的要求输出JSON格式的结果。
        """
        self.network.eval()  # evaluation mode: no gradient computation, no dropout, etc.
        self.network.drop_emb = False
        # As in update(), run a forward pass to get the network's predictions
        x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, \
        query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, \
        ground_truth, context_str, context_words, context_word_offsets, answers, context_id, turn_ids = batch
        context_len = len(context_words)
        score_s, score_e, score_yes, score_no, score_no_answer = self.network(
            x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert,
            x_bert_mask, x_bert_offsets, query, query_mask, query_char,
            query_char_mask, query_bert, query_bert_mask, query_bert_offsets,
            len(context_words))
        batch_size = score_s.shape[0]
        max_len = self.opt['max_len'] or score_s.size(1)
        # As in update(), build the flattened score vector of size m * m + 3
        expand_score = gen_upper_triangle(score_s, score_e, max_len,
                                          self.use_cuda)
        scores = torch.cat(
            (expand_score, score_no, score_yes, score_no_answer),
            dim=1)  # batch * (m * m + 3)
        prob = F.softmax(scores, dim=1).data.cpu()  # move results to CPU for NumPy-friendly ops
        predictions = []  # predicted answer strings
        confidence = []  # predicted answer probabilities
        pred_json = []  # answers in JSON format
        for i in range(batch_size):
            _, ids = torch.sort(prob[i, :],
                                descending=True)  # sort example i's candidate probabilities in descending order, keeping the indices
            idx = 0
            best_id = ids[idx]  # best_id indexes the most probable answer, in [0, m * m + 2]
            confidence.append(float(prob[i, best_id]))
            # Span answer: recover the start word st and end word ed from best_id
            if best_id < context_len * context_len:
                st = best_id // context_len
                ed = best_id % context_len
                # context_word_offsets gives the character offsets of each word's first and last characters in the passage
                st = context_word_offsets[st][0]
                ed = context_word_offsets[ed][1]
                # extract the predicted answer string
                predictions.append(context_str[st:ed])
            # the answer is "no"
            if best_id == context_len * context_len:
                predictions.append('no')
            # the answer is "yes"
            if best_id == context_len * context_len + 1:
                predictions.append('yes')
            # the question has no answer
            if best_id == context_len * context_len + 2:
                predictions.append('unknown')
            # record the JSON-format output
            pred_json.append({
                'id': context_id,
                'turn_id': turn_ids[i],
                'answer': predictions[-1]
            })
        return predictions, confidence, pred_json  # list of strings, list of floats, list of jsons

    def train(self):
        """
            train()函数进行批次处理,即对于一个batch的数据,计算当前预测结果并求导更新参数。
            每训练1500个batch,利用predict()函数在验证数据上进行一次预测并计算准确率得分。
            当前得分最高的模型参数保存在run_id文件夹中。
        """
        self.isTrain = True  # mark training mode
        self.getSaveFolder()
        self.saveConf()
        # obtain the vocabularies and embeddings from CoQAPreprocess
        self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data()
        self.log('-----------------------------------------------')
        self.log('Initializing model...')
        self.setup_model(vocab_embedding)  # initialize the model

        if 'RESUME' in self.opt:  # when resuming training, load the previously saved model
            model_path = os.path.join(self.opt['datadir'],
                                      self.opt['MODEL_PATH'])
            self.load_model(model_path)

        print('Loading train json')  # load the preprocessed training data
        with open(
                os.path.join(self.opt['FEATURE_FOLDER'],
                             self.data_prefix + 'train-preprocessed.json'),
                'r') as f:
            train_data = json.load(f)

        print('Loading dev json')  # load the preprocessed dev data
        with open(
                os.path.join(self.opt['FEATURE_FOLDER'],
                             self.data_prefix + 'dev-preprocessed.json'),
                'r') as f:
            dev_data = json.load(f)

        best_f1_score = 0.0  # best F1 score achieved on the dev set so far
        numEpochs = self.opt['EPOCH']  # EPOCH in the config file is the number of training epochs
        for epoch in range(self.epoch_start, numEpochs):
            self.log('Epoch {}'.format(epoch))
            # training mode: enable dropout and related behavior
            self.network.train()
            startTime = datetime.now()
            # batch iterator over the training data
            train_batches = BatchGen(self.opt, train_data['data'],
                                     self.use_cuda, self.vocab,
                                     self.char_vocab)
            # batch iterator over the dev data
            dev_batches = BatchGen(self.opt,
                                   dev_data['data'],
                                   self.use_cuda,
                                   self.vocab,
                                   self.char_vocab,
                                   evaluation=True)
            for i, batch in enumerate(train_batches):
                # At the end of each epoch, on the first batch when resuming, or every 1500 batches, evaluate on the dev data
                if i == len(train_batches) - 1 or (
                        epoch == 0 and i == 0 and
                    ('RESUME' in self.opt)) or (i > 0 and i % 1500 == 0):
                    print('Saving folder is', self.saveFolder)
                    print('Evaluating on dev set...')
                    predictions = []
                    confidence = []
                    dev_answer = []
                    final_json = []
                    for j, dev_batch in enumerate(dev_batches):
                        # predictions include the answer text, its confidence score, and JSON-format results
                        phrase, phrase_score, pred_json = self.predict(
                            dev_batch)
                        final_json.extend(pred_json)
                        predictions.extend(phrase)
                        confidence.extend(phrase_score)
                        dev_answer.extend(dev_batch[-3])  # answer_str
                    # compute exact match (EM) and F1 scores
                    result, all_f1s = score(pred=predictions,
                                            truth=dev_answer,
                                            final_json=final_json)
                    f1 = result['f1']
                    # if this F1 beats all previous models, save the model
                    if f1 > best_f1_score:
                        model_file = os.path.join(self.saveFolder,
                                                  'best_model.pt')
                        self.save_for_predict(model_file, epoch)
                        best_f1_score = f1
                        pred_json_file = os.path.join(self.saveFolder,
                                                      'prediction.json')
                        with open(pred_json_file, 'w') as output_file:
                            json.dump(final_json, output_file)
                        score_per_instance = []
                        for instance, s in zip(final_json, all_f1s):
                            score_per_instance.append({
                                'id': instance['id'],
                                'turn_id': instance['turn_id'],
                                'f1': s
                            })
                        score_per_instance_json_file = os.path.join(
                            self.saveFolder, 'score_per_instance.json')
                        with open(score_per_instance_json_file,
                                  'w') as output_file:
                            json.dump(score_per_instance, output_file)
                    self.log('Epoch {0} - dev F1: {1:.3f} (best F1: {2:.3f})'.
                             format(epoch, f1, best_f1_score))
                    self.log('Results breakdown\n{0}'.format(result))
                # forward pass, gradients and parameter update for this batch
                self.update(batch)
                if i % 100 == 0:
                    self.log(
                        'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.
                        format(
                            self.updates, self.train_loss.avg,
                            str((datetime.now() - startTime) / (i + 1) *
                                (len(train_batches) - i - 1)).split('.')[0]))

            print('PROGRESS: {0:.2f}%'.format(100.0 * (epoch + 1) / numEpochs))
            print('Config file is at ' + self.opt['confFile'])
Example #5
class ConvQA_CN_NetTrainer(BaseTrainer):
    def __init__(self, opt):
        super(ConvQA_CN_NetTrainer, self).__init__(opt)
        print('Model Trainer')
        set_dropout_prob(0.0 if 'DROPOUT' not in opt else float(opt['DROPOUT']))
        self.seed = int(opt['SEED'])
        self.opt = opt
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        if self.opt['dataset'] == 'quac':
            self.data_prefix = 'quac-'
            self.preproc = QuACPreprocess(self.opt)
        if self.use_cuda:
            torch.cuda.manual_seed_all(self.seed)

        ### seq2seq
        self.train_lang, self.dev_lang = dataprocess("train", "dev")
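        # dataprocess (a repo helper) builds the train/dev language objects; judging
        # by the n_words fields used below, they carry the vocabulary sizes.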
        self.opt['train_words'] = self.train_lang.n_words
        self.opt['dev_words'] = self.dev_lang.n_words

    def train(self):
        self.getSaveFolder()
        self.saveConf()
        self.result_file = self.opt['RESULT_FILE']
        self.log('-----------------------------------------------')
        self.log("Initializing model...")
        self.setup_model()

        if 'CHECK_POINT' in self.opt:
            model_path = os.path.join(self.opt['datadir'], self.opt['CHECK_POINT_PATH'])
            self.load_model(model_path)

        print('Loading train json...')
        with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'train-preprocessed.json'), 'r') as f:
            train_data = json.load(f)

        with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'dev-preprocessed.json'), 'r') as f:
            dev_data = json.load(f)

        output_prediction_file = self.opt['OUTPUT_FILE'] + "prediction_file.json"
        best_f1_score = 0
        last_epoch = 0
        num_epochs = self.opt['EPOCH']
        # self.scheduler = CyclicLRWithRestarts(self.optimizer, batch_size, num_epochs, restart_period=5, t_mult=1.2,
        #                                  policy="cosine")
        for epoch in range(self.epoch_start, num_epochs):
            ### best_f1_score tracks the best value within each epoch
            self.log('\n########Epoch {}########\n'.format(epoch))
            # self.network.train()
            start_time = datetime.now()
            train_batches = BatchGen(self.opt, train_data['data'], self.use_cuda, is_training=True)
            dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda, is_training=False)
            # self.scheduler.step()
            ### step = 2700
            for i, batch in enumerate(train_batches):

                ''' First decide whether to enter the evaluation phase.
                Three conditions:
                    1. normal training is about to end
                    2. training has just started and a checkpoint was loaded
                    3. evaluate every 1600 steps (tunable)
                '''
                # if i == len(train_batches) - 1 or (epoch == 0 and i == 0 and ('CHECK_POINT' in self.opt)) or (i ==1800):
                # if (self.updates >= 0 and self.updates % 5000 == 0):
                # if self.updates>0 and self.updates%1000==0:
                if self.updates > 0 and self.updates % 1000 == 0:

                    print('Saving folder is', self.saveFolder)
                    print('Evaluating on dev set......')

                    final_json, all_predictions_list, all_nbest_json_list = [], [], []
                    results = {}
                    count = 0
                    for j, dev_batch in enumerate(dev_batches):
                        pred_json, all_predictions, all_nbest_json = self.predict(dev_batch)
                        count += len(pred_json)
                        final_json.append(pred_json)
                        all_predictions_list += all_predictions

                    with open(output_prediction_file, "w") as writer:
                        writer.write(json.dumps(all_predictions_list, indent=4) + "\n")
                    with open(self.opt['Quac_DEV_FILE'], 'r') as f:
                        val_file = json.load(f)
                    val_file = val_file['data']

                    # Build the nested dict {id: {turn_id: [answer, 'y', 'y']}} expected by eval_fn
                    new = {}
                    for r in all_predictions_list:
                        if r['id'] not in new:
                            new[r['id']] = {}
                        new[r['id']][r['turn_id']] = [r['answer'], 'y', 'y']

                    metric_json = eval_fn(val_file, new, False)
                    # logger.info("Results: {}".format(results))
                    final_f1 = metric_json['f1']
                    # pdb.set_trace()
                    if best_f1_score != 0:
                        print("Best F1 : {}".format(max(final_f1, best_f1_score)))
                    # print("dev loss: ", final_loss)

                    if final_f1 > best_f1_score:
                        model_file = os.path.join(self.result_file, 'best_model.pt')
                        self.save_for_predict(model_file, epoch)
                        best_f1_score = final_f1
                        pred_json_file = os.path.join(self.result_file, 'prediction.json')
                        with open(pred_json_file, 'w', encoding='utf-8') as output_file:
                            json.dump(final_json, output_file, ensure_ascii=False)
                        # with open(pred_json_file, 'w', encoding='utf-8') as result_file:
                        #     json.dump("f1: {}".format(final_f1), result_file, ensure_ascii=False)
                        score_per_instance = []

                        ### it holds that len(all_f1) == len(final_json)
                        for instance in final_json:
                            score_per_instance.append({
                                'id': instance[0]['id'],
                                'turn_id': instance[0]['turn_id']})

                        score_per_instance_json_file = os.path.join(self.result_file, 'score_per_instance.json')
                        with open(score_per_instance_json_file, 'w') as output_file:
                            json.dump(score_per_instance, output_file)

                    self.log("Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})\n".format(epoch, final_f1, best_f1_score))
                    # self.log("Results breakdown\n{0}".format(result))
                # if self.updates<200:
                #     # print(self.updates)
                #     self.updates += 1
                #     continue
                self.update(batch)
                if i % 100 == 0:
                    self.log('**********************EPOCH[{0:2}] i[{1:4}] updates[{2:6}] train loss[{3:.5f}] remaining[{4}]'.format(
                        epoch, i, self.updates, self.train_loss.avg,
                        str((datetime.now() - start_time) / (i + 1) * (len(train_batches) - i - 1)).split('.')[0]))

            print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / num_epochs))
            print('Config file is at ' + self.opt['confFile'])

    def setup_model(self):
        self.train_loss = AverageMeter()
        self.network = ConvQA_CN_Net(self.opt, self.dev_lang)
        if self.use_cuda:
            self.log('Using GPU to setup model...')
            self.network.cuda()
        parameters = [p for p in self.network.parameters() if p.requires_grad]

        ## Ranger optimizer (RAdam + Lookahead)
        self.optimizer = Ranger(parameters)
        # self.optimizer = AdamW(parameters, lr=3e-5, weight_decay=0.01)
        self.updates = 0
        self.epoch_start = 0
        # self.loss_func = F.cross_entropy

    def update(self, batch):

        self.network.train()
        self.network.drop_emb = True

        use_his = True
        x, x_offsets, x_bert_mask, rational_mask, x_sep, q, q_mask, ground_truth, context_str, \
            context_word_offsets, ex_pre_answer_strs, is_max_context, token_to_orig_map, answer_types, cls_idx, input_answer_strs, context_ids, turn_ids, his_inf_list, followup_list, yesno_list = batch

        # Stack each example's ground-truth span list into a single LongTensor of
        # shape (batch, span_width); the network computes its loss from it in training mode
        truth = []
        for i in range(len(ground_truth)):
            tmp = torch.LongTensor(ground_truth[i])
            tmp = torch.unsqueeze(tmp, 0)
            truth.append(tmp)
        ground_truth = torch.cat(truth)

        ### forward
        loss = self.network(x, x_bert_mask, rational_mask, x_sep, q, q_mask, his_inf_list, input_answer_strs, ex_pre_answer_strs, ground_truth, context_str, context_ids, turn_ids, answer_types, cls_idx, is_max_context, token_to_orig_map, followup_list, yesno_list, True)

        self.train_loss.update(loss.item(), 1)
        self.optimizer.zero_grad()
        loss.backward()
        # tmp = []
        # for name, p in self.network.named_parameters():
        #     a = p.grad
        #     tmp.append((name, a))

        # for name, param in self.network.named_parameters():
        #     a = param.view(1,-1).squeeze(0).tolist()
        #     if sum(a)==0:
        #         print(param.size())
        #         print(name)
        #         print(param)
        torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.opt['grad_clipping'])
        self.optimizer.step()
        # self.scheduler.batch_step()
        self.updates += 1


    def predict(self, batch):
        self.network.eval()
        self.network.drop_emb = False

        x, x_offsets, x_bert_mask, rational_mask, x_sep, q, q_mask, ground_truth, context_str, \
            context_word_offsets, ex_pre_answer_strs, is_max_context, token_to_orig_map, answer_types, cls_idx, input_answer_strs, context_ids, turn_ids, his_inf_list, followup_list, yesno_list = batch

        pred_json, all_predictions, all_nbest_json = self.network(x, x_bert_mask, rational_mask, x_sep, q, q_mask, his_inf_list, input_answer_strs, ex_pre_answer_strs, ground_truth, context_str, context_ids, turn_ids, answer_types, cls_idx, is_max_context, token_to_orig_map, followup_list, yesno_list, False)

        return pred_json, all_predictions, all_nbest_json


    def load_model(self, model_path):
        print('Loading model from', model_path)
        checkpoint = torch.load(model_path)
        state_dict = checkpoint['state_dict']
        new_state = set(self.network.state_dict().keys())
        for k in list(state_dict['network'].keys()):
            if k not in new_state:
                del state_dict['network'][k]
        for k, v in list(self.network.state_dict().items()):
            if k not in state_dict['network']:
                state_dict['network'][k] = v
        self.network.load_state_dict(state_dict['network'])

        print('Loading finished', model_path)

    def save(self, filename, epoch, prev_filename):
        params = {
            'state_dict': {
                'network': self.network.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'updates': self.updates  # how many updates
            },
            'train_loss': {
                'val': self.train_loss.val,
                'avg': self.train_loss.avg,
                'sum': self.train_loss.sum,
                'count': self.train_loss.count
            },
            'config': self.opt,
            'epoch': epoch
        }
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
            if os.path.exists(prev_filename):
                os.remove(prev_filename)
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')

    def save_for_predict(self, filename, epoch):
        network_state = dict([(k, v) for k, v in self.network.state_dict().items() if
                              k[0:4] != 'CoVe' and k[0:4] != 'ELMo' and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert'])

        if 'eval_embed.weight' in network_state:
            del network_state['eval_embed.weight']
        if 'fixed_embedding' in network_state:
            del network_state['fixed_embedding']
        params = {
            'state_dict': {'network': network_state},
            'config': self.opt,
        }
        try:
            torch.save(params, filename)
            self.log('model saved to {}'.format(filename))
        except BaseException:
            self.log('[ WARN: Saving failed... continuing anyway. ]')

    def process_ans(self, ans):
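        # Normalize spacing around punctuation and strip stray '^' markers from a
        # predicted answer string.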
        ans = ans.replace(" , ", ", ")
        ans = ans.replace(" . ", ". ")
        ans = ans.replace(" ? ", "? ")
        ans = ans.replace("^ ", "")
        ans = ans.replace(" ^ ", "")
        ans = ans.replace("? ^ ", "")
        return ans

    def _get_best_indexes(self, logits, n_best_size):
        """Get the n-best logits from a list."""
        index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

        best_indexes = []
        for i in range(len(index_and_score)):
            if i >= n_best_size:
                break
            best_indexes.append(index_and_score[i][0])
        return best_indexes
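
        # Example with hypothetical logits:
        #   self._get_best_indexes([0.1, 0.9, 0.3], n_best_size=2) -> [1, 2]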

    def normalize_answer(self, s):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(text):
            return re.sub(r'\b(a|an|the)\b', ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(s))))
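
        # Example: self.normalize_answer("The  Quick, Brown fox!") -> 'quick brown fox'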

    def f1_score(self, prediction, ground_truth):
        prediction_tokens = self.normalize_answer(prediction).split()
        ground_truth_tokens = self.normalize_answer(ground_truth).split()
        common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1.0 * num_same / len(prediction_tokens)
        recall = 1.0 * num_same / len(ground_truth_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1
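
        # Worked example: prediction "the cat sat" vs. ground truth "cat sat down"
        # normalizes to ['cat', 'sat'] and ['cat', 'sat', 'down']; num_same = 2,
        # precision = 2/2 = 1.0, recall = 2/3, F1 = 2 * 1.0 * (2/3) / (1.0 + 2/3) = 0.8.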