Example #1
    def _get_users_subscription(self, user_ids):
        '''
        Fetches each user's group subscriptions and merges them into a single set
        of unique group ids (self.users_subscription). Takes one parameter:
        a list of user ids.
        '''
        self._set_program_state('Processing data')

        requests_list = [
            'API.groups.get({"user_id":' + str(item) + '}).items,'
            for item in user_ids
        ]
        self._send_message('Preparing additional requests.', state=requests_list)
        if len(requests_list) > 0:
            self._send_message(f'Prepared {len(requests_list)} additional requests')
            one_percent = len(requests_list) / 100
            progressbar = ProgressBar(60)

            while requests_list:
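                # VK's execute() method accepts at most 25 API calls per request, so the queue is processed in batches of 25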
                temp_list = requests_list[:25]
                requests_list = requests_list[25:]
                _requests = ''.join(temp_list)
                percent = len(temp_list) / one_percent
                request = self._api_execute(f'return [{_requests}];')
                for item in request:
                    if item:
                        for el in item:
                            self.users_subscription.add(el)
                progressbar.set_progress(percent)
            self._send_message(f'Added {len(self.users_subscription)} unique group_ids')
Example #2
def evaluate(args, model, tokenizer, prefix=""):
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
                                     collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)

        # Eval!
        logger.info("********* Running evaluation {} ********".format(prefix))
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert',
                                                                               'roberta'] else None  # XLM and DistilBERT don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
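            # accumulate logits and gold labels across batches on the CPU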
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        logger.info("******** Eval results {} ********".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" dev: %s = %s", key, str(result[key]))
    return results
Example #3
def evaluate(valid_loader, model):
    pbar = ProgressBar(n_total=len(valid_loader), desc='Evaluating')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx,(data, target) in enumerate(valid_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_fn(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            valid_loss.update(loss, n=data.size(0))
            valid_acc.update(correct, n=1)
            count += data.size(0)
            pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg,
            'valid_acc': valid_acc.sum / count}
Example #4
def valdation(model, val_dataloader, device, task_type):
    pre_labels = []
    total = 0
    total_correct = 0
    model.eval()
    loss_total = 0
    with torch.no_grad():
        pbar = ProgressBar(n_total=len(val_dataloader), desc='Validating')
        for step, batch in enumerate(val_dataloader):
            batch = [t.to(device) for t in batch]
            inputs_a = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
            }
            inputs_b = {
                'input_ids': batch[3],
                'attention_mask': batch[4],
                'token_type_ids': batch[5]
            }
            labels = batch[6]
            inputs = []
            inputs.append(inputs_a)
            inputs.append(inputs_b)
            output = model(inputs)
            if task_type == "classification":
                pred = torch.argmax(output, dim=1)
                pre_labels.extend(pred.detach().cpu().tolist())
                correct = (labels == pred).sum()
                total_correct += correct
                total += labels.size()[0]
                loss = F.cross_entropy(output, labels)
                pbar(step, {'loss': loss.item()})
            else:
                loss = F.mse_loss(output, labels)
                loss_total += loss.item()
    if task_type == "classification":
        acc = total_correct / total
        return acc, pre_labels
    else:
        return loss_total
Example #5
def train(args, loaders, model):
    train_monitor = TrainingMonitor(file_dir='./logs/', arch=f'resnet18_{args.norm_type}_{args.batch_size}')
    train_loader, valid_loader = loaders['train'], loaders['valid']
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
    for epoch in range(1,args.epochs + 1):
        pbar = ProgressBar(n_total=len(train_loader), desc='Training')
        train_loss = AverageMeter()
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            pbar(step=batch_idx, info={'loss': loss.item()})
            train_loss.update(loss.item(), n=1)
        valid_log = evaluate(valid_loader, model)
        train_log = {'loss':train_loss.avg}
        logs = dict(train_log, **valid_log)
        show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
        print(show_info)
        train_monitor.epoch_step(logs)
Example #6
def predict(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    pred_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if args.task_name == "mnli" else (
                             args.output_dir, )

    results = {}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args,
                                               pred_task,
                                               tokenizer,
                                               data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)

        args.pred_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        pred_sampler = SequentialSampler(
            pred_dataset) if args.local_rank == -1 else DistributedSampler(
                pred_dataset)
        pred_dataloader = DataLoader(
            pred_dataset,
            sampler=pred_sampler,
            batch_size=args.pred_batch_size,
            collate_fn=xlnet_collate_fn
            if args.model_type in ['xlnet'] else collate_fn)

        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type
                    ) else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        output_pred_file = os.path.join(pred_output_dir, prefix,
                                        "test_prediction.txt")
        with open(output_pred_file, "w") as writer:
            for pred in preds:
                writer.write(str(pred) + '\n')
    return results
Example #7
def train(args, train_dataset, model, tokenizer, label_list, pad_token_label_id):
    """ Train the model """
    # load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    # total number of training steps
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # optimizer
    if args.optimizer.lower() == "adamw":
        no_decay = ["bias", "LayerNorm.weight"]
        # bert_parameters = eval('model.{}'.format(args.model_type)).named_parameters()
        base_model_param_optimizer = list(model.BaseModel.named_parameters())
        start_parameters = model.start_fc.named_parameters()
        end_parameters = model.end_fc.named_parameters()
        args.bert_lr = args.bert_lr if args.bert_lr else args.learning_rate
        args.start_lr = args.start_lr if args.start_lr else args.learning_rate
        args.end_lr = args.end_lr if args.end_lr else args.learning_rate
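        # separate parameter groups let the base encoder and the start/end heads use different learning rates and weight decay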
        optimizer_grouped_parameters = [
            {"params": [p for n, p in base_model_param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
            "lr": args.bert_lr},
            {"params": [p for n, p in base_model_param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": args.bert_lr},

            {"params": [p for n, p in start_parameters if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
            "lr": args.start_lr},
            {"params": [p for n, p in start_parameters if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": args.start_lr},

            {"params": [p for n, p in end_parameters if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
            "lr": args.end_lr},
            {"params": [p for n, p in end_parameters if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": args.end_lr}
        ]

        if "lstm" in args.model_type.lower():
             lstm_param_optimizer = list(model.Bilstm.named_parameters())
             optimizer_grouped_parameters.extend([{'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)],
                                                   'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
                                                  {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)],
                                                   'weight_decay': 0.0,'lr': args.crf_learning_rate}])

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    else:
        optimizer = Lamb(model.parameters())
    # learning rate schedule
    args.warmup_steps = int(t_total * args.warmup_proportion)    # learning-rate warmup
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # resume training from a checkpoint
    if args.continue_train and \
        os.path.isfile(os.path.join(args.continue_train_checkpoint, "optimizer.pt")) and \
        os.path.isfile(os.path.join(args.continue_train_checkpoint, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.continue_train_checkpoint, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.continue_train_checkpoint, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
            logger.info("using fp16 !!!")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-GPU training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # adversarial_training
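    # FGM / PGD add adversarial perturbations to the word-embedding parameters during training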
    if args.adv_training == 'fgm':
        adv = FGM(model=model, param_name='word_embeddings')
    elif args.adv_training == 'pgd':
        adv = PGD(model=model, param_name='word_embeddings') 

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size
                * args.gradient_accumulation_steps,
                )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # for reproducibility
    for _ in range(int(args.num_train_epochs)):
        logger.info(f"############### Epoch_{_} ###############")
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":   # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0] 
            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if args.adv_training:
                adv.adversarial_training(args, inputs, optimizer)             
            
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # step the optimizer before the LR scheduler
                model.zero_grad()
                global_step += 1
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:  # evaluate on the dev set
                    logger.info("\n global_step: %s", global_step)
                    logger.info("average tr_loss: %s", tr_loss/global_step)
                    evaluate(args, model, tokenizer, label_list, pad_token_label_id)
                if args.save_steps > 0 and global_step % args.save_steps == 0:  # save the model
                    logger.info("global_step: %s model saved!", global_step)
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
Example #8
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id,data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # build the set of span labels by stripping the BIO prefix (B-X / I-X -> X)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []   # all test results
    error_results = []   # mispredicted examples
    true_labels = []   # gold labels
    predict_labels = []   # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")

    if isinstance(model, torch.nn.DataParallel):  # multi-GPU training
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":   # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]

        start_preds = start_logits.detach().cpu().numpy() 
        end_preds = end_logits.detach().cpu().numpy()

        start_preds = np.argmax(start_preds, axis=2)  
        end_preds = np.argmax(end_preds, axis=2)

        start_preds_list = [span_map[j] for j in start_preds[0][1:-1]]
        end_preds_list = [span_map[j] for j in end_preds[0][1:-1]]

        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        batch_true_labels = [args.id2label.get(i) for i in batch_true_labels]
        true_labels.append(batch_true_labels)
        
        batch_predict_labels = convert_span_to_bio([start_preds_list], [end_preds_list])
        predict_labels.extend(batch_predict_labels)
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]
        sent = ""

        ifError=False
        for input_id,pre,lab in zip(input_ids, batch_predict_labels[0], batch_true_labels):
            sent+=" ".join([tokenizer.ids_to_tokens[input_id],lab,pre])+"\n"
            if lab != pre:
                ifError=True
        sent+="\n"
        results.append(sent)
        if ifError:
            error_results.append(sent)
            ifError = False
        pbar(step)
        # compute test-set acc, recall, f1

    logger.info("\n测试集结果统计:")
    logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w",encoding="utf-8") as writer:
        for record in results:
            writer.write(record)

    with open(error_predict_file, "w",encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
Example #9
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # build the set of span labels by stripping the BIO prefix (B-X / I-X -> X)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running evaluation %s *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":   # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        
        start_preds = start_logits.detach().cpu().numpy() # [64, 128, 5]
        end_preds = end_logits.detach().cpu().numpy()

        start_preds = np.argmax(start_preds, axis=2)  # [64, 128]
        end_preds = np.argmax(end_preds, axis=2)

        start_preds_list = []
        end_preds_list = []

        batch_true_labels = batch[4].squeeze(0).cpu().numpy().tolist()
        for index, input_length in enumerate(batch[3]):  # batch[3] holds the length of each sequence
            start_preds_list.append([span_map[j] for j in start_preds[index][:input_length]][1:-1])
            end_preds_list.append([span_map[j] for j in end_preds[index][:input_length]][1:-1])
            batch_true = [args.id2label.get(i) for i in batch_true_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
        
        batch_predict_labels = convert_span_to_bio(start_preds_list, end_preds_list)
        predict_labels.extend(batch_predict_labels)

        pbar(step)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss/nb_eval_steps))
    logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
Example #10
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    ###################################################################
    # initialize a pandas DataFrame to store the training log
    df_path = args.output_dir + "/bert" + "df_log.pickle"
    if not os.path.isfile(df_path):
        df = pd.DataFrame(columns=[
            "epoch", "train_loss", "train_auc", "test_loss", "test_auc"
        ])
        df.to_pickle(df_path)
        print("log DataFrame created!")
    #################################################################
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs
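    # warm the learning rate up over the first warmup_proportion of all training steps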
    args.warmup_steps = int(t_total * args.warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)

    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    seed_everything(args.seed)
    for _ in range(int(args.num_train_epochs)):
        all_predictions, all_labels = [], []
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Train')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet', 'albert', 'roberta'
                ] else None  # XLM, DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]

            #############################################################################################################################
            # extract the predictions and labels and store them in all_predictions / all_labels,
            # which are used to compute the running AUC

            predictions = fl.softmax(outputs[1], dim=-1)[:, 1]
            predictions = predictions.detach().cpu().numpy().reshape(
                -1).tolist()
            labels = batch[3].cpu().numpy().reshape(-1).tolist()
            all_predictions.extend(predictions)
            all_labels.extend(labels)
            # compute AUC
            fpr, tpr, thresholds = metrics.roc_curve(y_true=all_labels,
                                                     y_score=all_predictions)
            auc = metrics.auc(fpr, tpr)

            #############################################################################################################################

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            ###############################################################################
            log_dic = {
                "epoch": _,
                "train_loss": tr_loss / (step + 1),
                "train_auc": auc,
                "test_loss": 0,
                "test_auc": 0
            }
            if step % 10 == 0:
                print((str({k: v for k, v in log_dic.items() if v != 0})))

            df = pd.read_pickle(df_path)
            df = df.append([log_dic])
            df.reset_index(inplace=True, drop=True)
            df.to_pickle(df_path)
            ##############################################################################
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                # flag
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:

                    if args.local_rank == -1:
                        results = evaluate(args, model, tokenizer)

                # save the model every args.save_steps steps
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:

                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(vocab_path=output_dir)
            pbar(step, {'loss': loss.item()})
        ####################################################################################################################
        global all_auc
        global threshold
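        # threshold counts consecutive epochs without an AUC improvement (early-stopping patience)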

        all_auc.append(auc)
        best_auc = max(all_auc)
        if all_auc[-1] < best_auc:
            threshold += 1
            args.learning_rate *= 0.8
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
            predict(args, model, tokenizer, prefix="")
        else:
            # this epoch reached the best AUC so far: reset the early-stopping counter
            threshold = 0

        if threshold >= 5:
            print("epoch {} has the lowest loss".format(
                0 + np.argmax(np.array(all_auc))))
            print("early stop!")
            break
        #####################################################################################################################
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
Example #11
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           label_list,
                                           pad_token_label_id,
                                           data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate_fn)
    # Eval
    logger.info("***** Running evaluation %s *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":  # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type
                                            in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            batch_predict_labels = model.crf.decode(logits,
                                                    inputs['attention_mask'])
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean(
            )  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1

        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()
        pbar(step)
        for index, input_length in enumerate(batch[4]):
            batch_true = [
                args.id2label.get(i)
                for i in batch_true_labels[index][:input_length]
            ][1:-1]
            batch_predict = [
                args.id2label.get(i)
                for i in batch_predict_labels[index][:input_length]
            ][1:-1]
            true_labels.append(batch_true)
            predict_labels.append(batch_predict)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuary: %s", str(accuracy_score(true_labels,
                                                  predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(
        str(
            classification_report(true_labels,
                                  predict_labels,
                                  mode='strict',
                                  scheme=IOB2)))
Example #12
def train(args):
    logger.info("args: %s", args)
    tokenizer = BertTokenizer.from_pretrained(args.pretrained)
    config = BertConfig.from_pretrained(args.pretrained)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    task_type = args.task_type
    model = SentenceBert.from_pretrained(
        config=config,
        pretrained_model_name_or_path=args.pretrained,
        max_len=args.max_len,
        tokenizer=tokenizer,
        device=device,
        task_type=task_type)
    model.to(device)

    train_dataset = DataReader(tokenizer=tokenizer,
                               filepath=args.train_file,
                               max_len=args.max_len,
                               task_type=task_type)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True)

    val_dataset = DataReader(tokenizer=tokenizer,
                             filepath=args.val_file,
                             max_len=args.max_len,
                             task_type=task_type)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                shuffle=True)

    optimizer = AdamW(model.parameters(), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                  mode='max',
                                  factor=0.5,
                                  patience=2)
    re_scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                     mode='min',
                                     factor=0.5,
                                     patience=2)
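    # two plateau schedulers: validation accuracy is maximized for classification, validation MSE is minimized for regression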

    model.train()
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataloader))
    logger.info("  Num Epochs = %d", args.epochs)
    best_acc = 0.0
    re_loss_min = 1000000.0
    for epoch in range(args.epochs):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            batch = [t.to(device) for t in batch]
            inputs_a = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
            }
            inputs_b = {
                'input_ids': batch[3],
                'attention_mask': batch[4],
                'token_type_ids': batch[5]
            }
            labels = batch[6]
            inputs = []
            inputs.append(inputs_a)
            inputs.append(inputs_b)
            output = model(inputs)
            if task_type == "classification":
                loss = F.cross_entropy(output, labels)
            else:
                loss = F.mse_loss(output, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pbar(step, {'loss': loss.item()})

        time_str = datetime.now().strftime('%Y-%m-%d')
        if task_type == "classification":
            # train_acc = valdation(model,train_dataloader,device,task_type)
            val_acc, _ = valdation(model, val_dataloader, device, task_type)  # classification returns (accuracy, predicted labels)
            scheduler.step(val_acc)

            if val_acc > best_acc:
                best_acc = val_acc
                save_path = os.path.join(args.model_out, "classification",
                                         "20W_SBert_best_new" + time_srt)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_vocabulary(save_path)
            # logger.info("train_acc: %.4f------val_acc:%.4f------best_acc:%.4f"%(train_acc,val_acc,best_acc))
            logger.info("val_acc:%.4f------best_acc:%.4f" %
                        (val_acc, best_acc))
        else:
            # re_train_loss = valdation(model, train_dataloader, device, task_type)
            re_val_loss = valdation(model, val_dataloader, device, task_type)
            re_scheduler.step(re_val_loss)

            if re_loss_min > re_val_loss:
                re_loss_min = re_val_loss
                save_path = os.path.join(args.model_out, "regression",
                                         "20W_SBert_best_new_bak_" + time_srt)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_vocabulary(save_path)

            # logger.info("re_train_loss:%.4f------re_val_loss: %.4f------re_loss_min:%.4f" % (re_train_loss,re_val_loss, re_loss_min))
            logger.info("re_val_loss: %.4f------re_loss_min:%.4f" %
                        (re_val_loss, re_loss_min))
Example #13
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=xlnet_collate_fn if
                                  args.model_type in ['xlnet'] else collate_fn)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet', 'albert', 'roberta'
                ] else None  # XLM, DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    print(" ")
                    logs = {}
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
Example #14
def predict(args, model, tokenizer, label_list, prefix=""):
    pred_task_names = (args.task_name, )
    pred_outputs_dirs = (args.output_dir, )
    label_map = {i: label for i, label in enumerate(label_list)}

    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args,
                                               pred_task,
                                               tokenizer,
                                               data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)

        args.pred_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        pred_sampler = SequentialSampler(
            pred_dataset) if args.local_rank == -1 else DistributedSampler(
                pred_dataset)
        pred_dataloader = DataLoader(
            pred_dataset,
            sampler=pred_sampler,
            batch_size=args.pred_batch_size,
            collate_fn=xlnet_collate_fn
            if args.model_type in ['xlnet'] else collate_fn)

        logger.info("******** Running prediction {} ********".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type
                    ) else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                if pred_task == 'copa':
                    preds = logits.softmax(-1).detach().cpu().numpy()
                else:
                    preds = logits.detach().cpu().numpy()
            else:
                if pred_task == 'copa':
                    preds = np.append(
                        preds,
                        logits.softmax(-1).detach().cpu().numpy(),
                        axis=0)
                else:
                    preds = np.append(preds,
                                      logits.detach().cpu().numpy(),
                                      axis=0)
            pbar(step)
        print(' ')
        if args.output_mode == "classification":
            predict_label = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            predict_label = np.squeeze(preds)
        if pred_task == 'copa':
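            # COPA: consecutive rows are the two candidate answers for one question; pick the one with the higher score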
            predict_label = []
            pred_logits = preds[:, 1]
            i = 0
            while (i < len(pred_logits) - 1):
                if pred_logits[i] >= pred_logits[i + 1]:
                    predict_label.append(0)
                else:
                    predict_label.append(1)
                i += 2
        output_submit_file = os.path.join(pred_output_dir, prefix,
                                          "test_prediction.json")
        output_logits_file = os.path.join(pred_output_dir, prefix,
                                          "test_logits")
        # save the predicted labels
        with open(output_submit_file, "w") as writer:
            for i, pred in enumerate(predict_label):
                json_d = {}
                json_d['id'] = i
                json_d['label'] = str(label_map[pred])
                writer.write(json.dumps(json_d) + '\n')
        # save the intermediate prediction results (logits)
        save_numpy(file_path=output_logits_file, data=preds)
Example #15
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           label_list,
                                           pad_token_label_id,
                                           data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=1,
                                 collate_fn=collate_fn)  # only one example per batch
    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []  # all test results
    error_results = []  # mispredicted examples
    true_labels = []  # gold labels
    predict_labels = []  # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix,
                                       "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix,
                                      "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")

    if isinstance(model, torch.nn.DataParallel):  # unwrap the model when trained with multiple GPUs
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": None,
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":  # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type
                                            in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
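            # Viterbi-decode the emission scores with the CRF layer, masking padding tokens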
            batch_predict_labels = model.crf.decode(logits,
                                                    inputs['attention_mask'])

        batch_predict_labels = batch_predict_labels[0][
            1:-1]  # strip [CLS] and [SEP]; only one example per batch
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]

        sent = ""

        ifError = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels,
                                      batch_true_labels):
            sent += " ".join([
                tokenizer.ids_to_tokens[input_id], args.id2label[lab],
                args.id2label[pre]
            ]) + "\n"
            if args.id2label[lab] != args.id2label[pre]:
                ifError = True
        sent += "\n"
        results.append(sent)
        if ifError:
            error_results.append(sent)
            ifError = False
        pbar(step)
        # collect labels to compute test-set accuracy, precision, recall and F1
        batch_true = [args.id2label.get(i) for i in batch_true_labels]
        batch_predict = [args.id2label.get(i) for i in batch_predict_labels]
        assert len(batch_true) == len(batch_predict)
        true_labels.append(batch_true)
        predict_labels.append(batch_predict)

    logger.info("\n测试集结果统计:")
    logger.info("accuary: %s", str(accuracy_score(true_labels,
                                                  predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(
        str(
            classification_report(true_labels,
                                  predict_labels,
                                  mode='strict',
                                  scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)

    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
Example #16
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=xlnet_collate_fn if
                                  args.model_type in ['xlnet'] else collate_fn)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
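    # Warmup length is a fixed proportion of the total number of optimization steps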
    args.warmup_steps = int(t_total * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(
        args.seed
    )  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet', 'albert', 'roberta'
                ] else None  # XLM, DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
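            # Scale the loss so gradients accumulated over several mini-batches match one large-batch update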
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
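            # Step the optimizer only once every gradient_accumulation_steps mini-batches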
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    print(" ")
                    # Log metrics
                    if args.local_rank == -1:  # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(vocab_path=output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
Example #17
0
def predict(args, model, tokenizer, prefix=""):

    pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    pred_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if args.task_name == "mnli" else (
                             args.output_dir, )

    results = {}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args,
                                               pred_task,
                                               tokenizer,
                                               data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)


        # Prediction batch size is overridden here: 64 per GPU (the default was 128).
        args.pred_batch_size = 64 * max(1, args.n_gpu)

        pred_sampler = SequentialSampler(
            pred_dataset) if args.local_rank == -1 else DistributedSampler(
                pred_dataset)
        pred_dataloader = DataLoader(
            pred_dataset,
            sampler=pred_sampler,
            batch_size=args.pred_batch_size,
            collate_fn=xlnet_collate_fn
            if args.model_type in ['xlnet'] else collate_fn)

        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predict")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type
                    ) else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                _, logits = outputs[:2]

            nb_pred_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        write_list = session_probability(args.data_dir, preds, 1)

        output_pred_file = os.path.join(pred_output_dir, prefix,
                                        "test_prediction.txt")
        with open(output_pred_file, "w") as writer:
            writer.write('SessionId,Probability\n')
            for idx, element in enumerate(write_list):
                writer.write(str(idx + 50069) + "," + str(element) + '\n')
    return results