Example #1
def __init__(self, kb=True):
    super(MultiViewTestAgent, self).__init__(kb=kb)
    self.args = {'talk_samples': 128, 'topic_threshold': 0.5}
    # lazy import: the reranker is only needed by this agent
    from multiview import MultiView
    print('[!] MultiView reranker model will be initialized')
    self.reranker = MultiView(
        topic=True,
        length=True,
        nidf_tf=True,
        coherence=True,
        fluency=True,
        repetition_penalty=True,
        mmi=True,
        distinct=True,
        mmi_path='ckpt/train_generative/gpt2_mmi/best.pt',
        coherence_path='ckpt/train_retrieval/bertretrieval/best.pt',
        topic_path='ckpt/fasttext/model.bin',
        fluency_path='ckpt/LM/gpt2lm/best.pt',
    )
    print('[!] load multiview model done')
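
A minimal usage sketch for the reranker constructed above. This is hypothetical: "agent" stands for an initialized MultiViewTestAgent, the callable interface mirrors the GPT2Agent.talk example further below, and the context/candidate strings are placeholders.

import numpy as np

# hypothetical data; the reranker is assumed callable as in GPT2Agent.talk below
contexts = ['你 好 [SEP] 最近 怎么样', '你 好 [SEP] 最近 怎么样']  # one context copy per candidate
candidates = ['我 很 好', '天气 不错']  # candidate responses to rerank
scores = agent.reranker(contexts, candidates, topic=None)[0]
best = candidates[int(np.argmax(scores))]  # keep the candidate with the highest multiview score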
Example #2
import pathlib
import pickle

import numpy as np

# NOTE: config(...) and the MultiView model class are project-local
# and assumed to be importable in this module's namespace.


def single_multiview_exp(data_str, course_num, model_str, fold, skill_dim,
                         concept_dim, lambda_s, lambda_t, lambda_q, lambda_e,
                         lambda_bias, penalty_weight, markovian,
                         trade_off_example, max_iter, lr, log_file):
    """
    Pipeline for running a single experiment with 5-fold cross validation.
    :param: the arguments above specify one experiment configuration
    :return: [fold, total_test_count, rmse, mae]
    """

    with open(
            'data/{}/{}/{}_train_test.pkl'.format(data_str, course_num, fold),
            'rb') as f:
        data = pickle.load(f)
        print(data.keys())

    model_config = config(data,
                          skill_dim,
                          concept_dim,
                          lambda_s,
                          lambda_t,
                          lambda_q,
                          lambda_bias,
                          penalty_weight,
                          markovian_steps=markovian,
                          lambda_e=lambda_e,
                          lr=lr,
                          max_iter=max_iter,
                          trade_off_example=trade_off_example,
                          log_file=log_file)
    test_set = model_config['test']
    if model_str == 'multiview':
        model = MultiView(model_config)
    else:
        raise ValueError('unknown model: {}'.format(model_str))

    print(model.train_data)
    # find the first testing attempt, and add all examples before test_start_attempt into train_data
    test_start_attempt = None
    for (stud, att, index, obs, res) in sorted(test_set, key=lambda x: x[1]):
        if res == 0:
            test_start_attempt = att
            break
        else:
            model.train_data.append((stud, att, index, obs, res))
    if test_start_attempt is None:
        raise ValueError('no testing attempt found in the test set')

    total_test_count = 0
    sum_square_error, sum_abs_error = [0.] * 2
    for attempt in range(test_start_attempt, model.num_attempts):
        # train, and then predict the obs at current attempt
        model.current_test_attempt = attempt
        model.lr = lr
        model.training()
        test_data = []
        for (stud, att, index, obs, res) in test_set:
            if att == model.current_test_attempt:
                test_data.append((stud, att, index, obs, res))
                model.train_data.append((stud, att, index, obs, res))

                for i in range(max(0, att - model.markovian_steps), att):
                    model.train_data_markovian.append((stud, i, index, res))
                upper_steps = min(model.num_attempts,
                                  attempt + model.markovian_steps + 1)
                for j in range(attempt + 1, upper_steps):
                    model.train_data_markovian.append((stud, j, index, res))

        test_perf = model.testing(test_data)
        # re-initialize the bias for each attempt, student, question, example
        if attempt != model.num_attempts - 1:
            model.bias_s = np.zeros(model.num_users)
            model.bias_q = np.zeros(model.num_questions)
            model.bias_a = np.zeros(model.num_attempts)

        # if attempt not in perf_dict[fold]:
        #     perf_dict[fold][attempt] = test_perf
        test_count, _rmse, _mae = test_perf

        # cumulative all metrics over all attempts
        sum_square_error += (_rmse**2) * test_count
        sum_abs_error += _mae * test_count
        total_test_count += test_count

    rmse = np.sqrt(sum_square_error / total_test_count)
    mae = sum_abs_error / total_test_count

    dir_path = "saved_models/{}/{}/{}/".format(data_str, course_num, model_str)
    pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
    file_name = "fold_{}_skill_{}_concept_{}_ls_{}_lt_{}_lq_{}_le_{}_lbias_{}_pw_{}_" \
                "markov_{}_tradeoff_{}_lr_{}_max_iter_{}_model.pkl".format(fold, skill_dim,
                                                                           concept_dim, lambda_s,
                                                                           lambda_t, lambda_q,
                                                                           lambda_e, lambda_bias,
                                                                           penalty_weight,
                                                                           markovian,
                                                                           trade_off_example, lr,
                                                                           max_iter)

    file_path = dir_path + file_name
    with open(file_path, "wb") as f:
        pickle.dump(model, f)
    return [fold, total_test_count, rmse, mae]
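
A hypothetical invocation of the pipeline above; the dataset name, course number, and all hyperparameter values are illustrative placeholders, not values from the source:

# all argument values below are placeholders
fold, n_test, rmse, mae = single_multiview_exp(
    data_str='dataset', course_num='course1', model_str='multiview',
    fold=1, skill_dim=3, concept_dim=5, lambda_s=0.1, lambda_t=0.1,
    lambda_q=0.1, lambda_e=0.1, lambda_bias=0.01, penalty_weight=1.0,
    markovian=1, trade_off_example=0.5, max_iter=30, lr=0.1,
    log_file='multiview_exp.log')
print('fold {}: RMSE={:.4f}, MAE={:.4f} over {} test points'.format(
    fold, rmse, mae, n_test))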
Example #3
class GPT2Agent(BaseAgent):
    def __init__(self,
                 total_steps,
                 multi_gpu,
                 vocab_file='data/vocab/vocab_small',
                 run_mode='train',
                 lang='zh',
                 lm=False):
        super(GPT2Agent, self).__init__()
        # hyperparameters
        try:
            # self.gpu_ids = [int(i) for i in multi_gpu.split(',')]
            self.gpu_ids = list(range(len(multi_gpu.split(','))))
        except Exception:
            raise Exception(
                f'[!] multi gpu ids are needed, but got: {multi_gpu}')
        assert run_mode in [
            'train', 'test', 'rerank', 'rerank_ir'
        ], f'[!] run_mode must be one of train/test/rerank/rerank_ir, but got {run_mode}'
        # NOTE: the vocab_file argument is overridden here based on the language
        vocab_file = 'data/vocab/vocab_small' if lang == 'zh' else 'data/vocab/vocab_english'
        lr = 1 if lm else 1.5e-4
        self.args = {
            'lr': lr,
            'grad_clip': 1.0,
            'pad': 0,
            'tgt_len_size': 50,
            'lr_gamma': 0.5,
            'patience': 5,
            'min_lr': 1e-5,
            'warmup_steps': 2000,
            'total_steps': total_steps,
            'topk': 20,
            'topp': 1.0,
            'config_path': 'data/config/model_config_dialogue_big.json',
            'multi_gpu': self.gpu_ids,
            'run_mode': run_mode,
            'vocab_file': vocab_file,
            'lang': lang,
            'topic_transfer': {
                '音乐': 'music',
                '体育': 'sport',
                '数码产品': 'electric',
                '美食': 'food',
                '电影': 'movie'
            },
            'balanceddata_parallel_gpu0_size': 2,
            'repetition_penalty': 1,
        }
        # hyperparameters

        self.vocab = BertTokenizer(vocab_file=self.args['vocab_file'])
        self.vocab_size = len(self.vocab)
        self.unk = self.vocab.convert_tokens_to_ids('[UNK]')
        self.sep = self.vocab.convert_tokens_to_ids('[SEP]')
        self.cls = self.vocab.convert_tokens_to_ids('[CLS]')

        self.model = GPT2(self.vocab_size,
                          self.unk,
                          self.sep,
                          self.args['topk'],
                          self.args['topp'],
                          self.args['repetition_penalty'],
                          config_path=self.args['config_path'])

        self.criterion = nn.CrossEntropyLoss(ignore_index=self.args['pad'],
                                             reduction='sum')
        self.optimizer = transformers.AdamW(self.model.parameters(),
                                            lr=self.args['lr'],
                                            correct_bias=True)

        # need to obtain the whole iter
        self.warmup_scheduler = transformers.get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.args['warmup_steps'],
            num_training_steps=self.args['total_steps'])
        if torch.cuda.is_available():
            self.model.cuda()

        # train: DataParallel; test: no DataParallel
        if self.args['run_mode'] == 'train':
            self.model = DataParallel(self.model, device_ids=self.gpu_ids)
            # self.model = BalancedDataParallel(
            #         self.args['balanceddata_parallel_gpu0_size'],
            #         self.model,
            #         dim=0)

        # run_mode in ['rerank', 'rerank_ir']: use the multiview model for reranking
        if run_mode in ['rerank', 'rerank_ir']:
            from multiview import MultiView
            print('[!] MultiView reranker model will be initialized')
            self.reranker = MultiView(
                topic=True,
                length=True,
                nidf_tf=True,
                coherence=True,
                fluency=True,
                repetition_penalty=True,
                mmi=True,
                distinct=True,
                mmi_path='ckpt/train_generative/gpt2_mmi/best.pt',
                coherence_path='ckpt/train_retrieval/bertretrieval/best.pt',
                topic_path='ckpt/fasttext/model.bin',
                fluency_path='ckpt/LM/gpt2lm/best.pt',
            )
            print('[!] load multiview model done')

        if run_mode == 'rerank_ir':
            self.ir_agent = TestAgent()

        self.show_parameters(self.args)

    def train_model(self, train_iter, mode='train', recoder=None):
        self.model.train()
        total_loss, total_acc, batch_num = 0, [], 0
        pbar = tqdm(train_iter)
        oom_time = 0
        for idx, batch in enumerate(pbar):
            try:
                cid = batch

                self.optimizer.zero_grad()

                logits = self.model(cid)  # [batch, seq, vocab]
                # shift by one position: predict token t+1 from tokens up to t
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = cid[..., 1:].contiguous()
                loss = self.criterion(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))
                _, preds = shift_logits.max(dim=-1)  # [batch, seq]
                # mask out the pad positions when counting token accuracy
                not_ignore = shift_labels.ne(self.args['pad'])
                num_targets = not_ignore.long().sum().item()  # number of non-pad tokens
                correct = (shift_labels == preds) & not_ignore
                correct = correct.float().sum()
                # normalize loss and accuracy by the number of real tokens
                accuracy = correct / num_targets
                total_acc.append(accuracy)
                loss = loss / num_targets

                if mode == 'train':
                    loss.backward()
                    clip_grad_norm_(self.model.parameters(),
                                    self.args['grad_clip'])
                    self.optimizer.step()
                    self.warmup_scheduler.step()
                total_loss += loss.item()
                batch_num += 1

                pbar.set_description(
                    f'[!] OOM: {oom_time}, train loss: {round(loss.item(), 4)}, token acc: {round(accuracy.item(), 4)}'
                )
            except RuntimeError as exception:
                # on CUDA OOM, skip this batch instead of aborting the whole epoch
                if 'out of memory' in str(exception):
                    oom_time += 1
                    torch.cuda.empty_cache()
                else:
                    raise
        return round(total_loss / batch_num, 4)

    def test_model_samples(self, test_iter, path, samples=5):
        '''
        Generate `samples` candidates for one given conversation context
        '''
        def _filter(x):
            # truncate at the first [SEP], then strip the remaining special tokens
            if '[SEP]' in x:
                x = x[:x.index('[SEP]')]
            return x.replace('[PAD]', '').replace('[SEP]', '').strip()

        self.model.eval()
        pbar = tqdm(test_iter)
        max_size = self.args['tgt_len_size']
        with open(path, 'w') as f:
            for batch in pbar:
                c, r = batch  # c: [seq]
                c = c.unsqueeze(0)  # [1, seq]
                c_ = c.expand(samples, c.shape[-1])  # [samples(batch), seq]
                tgt = self.model.predict_batch(c_, max_size)
                tgt = [self.vocab.convert_ids_to_tokens(i) for i in tgt]
                tgt = [_filter(' '.join(i)) for i in tgt]

                ctx = self.vocab.convert_ids_to_tokens(c[0])
                ctx = ' '.join(ctx)

                ref = self.vocab.convert_ids_to_tokens(r)
                ref = ' '.join(ref)

                f.write(f'CTX: {ctx}\n')
                f.write(f'REF: {ref}\n')
                for idx, i in enumerate(tgt):
                    f.write(f'TGT{idx}: {i}\n')
                f.write('\n')
        print(f'[!] finished generating samples for the test dataset, written into {path}')

    def test_model(self, test_iter, path):
        '''
        Generate the test dataset and measure the performance
        '''
        def _filter(x):
            # strip padding tokens from the decoded string
            return x.replace('[PAD]', '')

        self.model.eval()
        pbar = tqdm(test_iter)
        with open(path, 'w') as f:
            for batch in pbar:
                c, r = batch
                max_size = max(len(r), self.args['tgt_len_size'])
                tgt = self.model.predict(c, max_size)
                text = self.vocab.convert_ids_to_tokens(tgt)
                tgt = ''.join(text)

                ctx = self.vocab.convert_ids_to_tokens(c)
                ctx = _filter(''.join(ctx))

                ref = self.vocab.convert_ids_to_tokens(r)
                ref = _filter(''.join(ref))

                f.write(f'CTX: {ctx}\n')
                f.write(f'REF: {ref}\n')
                f.write(f'TGT: {tgt}\n\n')
        print(f'[!] finished generating the test dataset, written into {path}')
        # measure the performance
        (b1, b2, b3, b4), \
            ((r_max_l, r_min_l, r_avg_l), (c_max_l, c_min_l, c_avg_l)), \
            (dist1, dist2, rdist1, rdist2), \
            (average, extrema, greedy) = cal_generative_metric(path, lang=self.args['lang'])
        print(
            f'[TEST] BLEU: {b1}/{b2}/{b3}/{b4}; Length(max, min, avg): {c_max_l}/{c_min_l}/{c_avg_l}|{r_max_l}/{r_min_l}/{r_avg_l}; Dist: {dist1}/{dist2}|{rdist1}/{rdist2}; Embedding(average/extrema/greedy): {average}/{extrema}/{greedy}'
        )

    @torch.no_grad()
    def talk(self, topic, msgs, maxlen=50, batch_size=16):
        '''
        topic, msgs: msgs is a single string whose utterances are joined by the [SEP] token;
        the effective batch size is 1.

        n_ctx is 300/512.

        If the topic score of msgs is too low, a trigger sentence is prepended to msgs.
        '''
        if topic is None:
            self.reranker.mode['topic'] = False
        else:
            # detect the topic of the msgs
            if not self.reranker.topic_scores(msgs, topic):
                trigger_s = random.choice(self.trigger_utterances[topic])
                msgs = f'{trigger_s} [SEP] {msgs}'
                print(f'[!] topic trigger mode is set up: {msgs}')
        # tokenizer
        if self.args['run_mode'] == 'test':
            msgs = torch.LongTensor(self.vocab.encode(msgs)[-(512 - maxlen):])
            msgs = to_cuda(msgs)
            tgt = self.model.predict(msgs, maxlen)
            tgt = self.vocab.convert_ids_to_tokens(tgt)
            tgt = ''.join(tgt)
            return tgt
        elif self.args['run_mode'] in ['rerank', 'rerank_ir']:
            # ========== predict_batch ==========
            msgs_ = self.vocab.encode(msgs)[-(512 - maxlen):]
            msgs_ = [deepcopy(msgs_) for _ in range(batch_size)]
            msgs_ = torch.LongTensor(msgs_)  # [batch, seq]
            msgs_ = to_cuda(msgs_)
            tgt = self.model.predict_batch(msgs_, maxlen)
            tgt = [self.vocab.convert_ids_to_tokens(i) for i in tgt]
            # cut from the first [SEP] token
            n_tgt = []
            for i in tgt:
                if '[SEP]' in i:
                    i = i[:i.index('[SEP]')]
                n_tgt.append(''.join(i))
            # multiview scores
            # rerank_ir also use the fast retrieval model
            if self.args['run_mode'] == 'rerank_ir':
                retrieval_rest = self.ir_agent.model.search(topic,
                                                            msgs,
                                                            samples=batch_size)
                retrieval_rest = [i['response'] for i in retrieval_rest]
                # remove the utterances that in the self.history
                retrieval_rest = list(set(retrieval_rest) - set(self.history))
                n_tgt.extend(retrieval_rest)
            contexts = [msgs] * len(n_tgt)
            if topic:
                topic = [self.args['topic_transfer'][topic]] * len(n_tgt)
                scores = self.reranker(contexts,
                                       n_tgt,
                                       topic=topic,
                                       history=self.history)[0]
            else:
                scores = self.reranker(contexts, n_tgt, topic=None)[0]
            index = np.argmax(scores)
            # candidates with index >= batch_size come from the retrieval system
            if index >= batch_size:
                print(
                    f'[!] response selected from the retrieval dialogue system; bs/length/index: {batch_size}/{len(n_tgt)}/{index}'
                )
            else:
                print(
                    f'[!] response selected from the generative dialogue system; bs/length/index: {batch_size}/{len(n_tgt)}/{index}'
                )
            response = n_tgt[index]
            return response
        else:
            raise Exception('[!] unsupported run_mode in gpt2 model talk function')
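
A hypothetical end-to-end sketch for the agent above; total_steps, the GPU string, and the dialogue text are placeholders, and the topic key '音乐' (music) comes from the topic_transfer table in __init__. The sketch assumes the checkpoints referenced in __init__ exist and that agent.history and agent.trigger_utterances are initialized elsewhere in the repository (they are not set in this example):

# placeholders throughout; see the assumptions stated above
agent = GPT2Agent(total_steps=100000, multi_gpu='0', run_mode='rerank', lang='zh')
agent.history = []  # hypothetical: talk() reads this attribute
reply = agent.talk('音乐', '你好 [SEP] 最近在听什么歌')  # topic + [SEP]-joined context
print(reply)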
Example #4
def classFactory(iface):
    # QGIS plugin entry point: load the MultiView plugin class from multiview.py
    from multiview import MultiView
    return MultiView(iface)
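
Note: classFactory(iface) is the conventional entry point QGIS calls when loading a Python plugin, where iface is the QgisInterface instance the plugin uses to interact with the application. This MultiView is a QGIS plugin class and, despite the shared name, is unrelated to the MultiView reranker in the earlier examples.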