Example #1
Evaluation routine: runs a shared DistilBERT encoder plus one task-specific head over a GLUE dev split and logs the average loss and task metrics.
def evaluate_each(main_model, sub_model, dataset, bs, metrics, mm="validation_matched"):
    # `mm` selects the dev split (for MNLI: "validation_matched" or "validation_mismatched").

    all_labels = []
    all_preds = []
    all_losses = []
    
    # Number of full batches in the dev split (any trailing partial batch is dropped).
    iterations = dataset.length("dev", mm) // bs
    eval_iter = GlueIterator(dataset.dataloader("dev", bs, mm))
    
    printInfo = "*** Evaluation of {:s} ***".format(metrics.task_name)
    logging.info(printInfo)

    main_model.eval()
    sub_model.eval()
    softmax_layer = torch.nn.Softmax(dim=1)

    with torch.no_grad():
        for i in range(1, iterations + 1):
            data = eval_iter.next()
        
            # DistilBERT takes no token_type_ids, so only ids, mask, and labels are needed.
            if use_gpu:
                input_ids = data['input_ids'].cuda()
                attention_mask = data['attention_mask'].cuda()
                label = data['labels'].cuda()
            else:
                input_ids = data['input_ids']
                attention_mask = data['attention_mask']
                label = data['labels']
            
            # Shared encoder forward, then the task head computes loss and logits.
            output_inter = main_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
            output = sub_model(input=output_inter, labels=label)
            loss = output[0].cpu().numpy().tolist()   # output[0] is the scalar loss
            label = label.cpu().numpy().tolist()
            
            if metrics.num_labels == 3:
                # 3-way classification (e.g. MNLI): argmax over logits
                pred = output.logits.cpu().argmax(dim=1).tolist()
            elif metrics.num_labels == 2:
                # Binary classification: round the positive-class probability
                pred = np.round(softmax_layer(output.logits).cpu().t()[1].numpy()).tolist()
            elif metrics.num_labels == 1:
                # Regression (STS-B): keep the raw scores
                pred = output.logits.cpu().t().numpy().tolist()[0]
                
            all_labels += label
            all_preds += pred
            all_losses += [loss]
            
    logging.info("loss = {:.6f}".format(sum(all_losses)/len(all_losses)))
    
    eval_result = metrics.result(np.array(all_labels), np.array(all_preds))
    
    for i in eval_result:
        printInfo = "{:s} = {:.6f}".format(i, eval_result[i])
        logging.info(printInfo)    
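
Both examples assume a GlueIterator helper whose definition is not shown here. Batches are drawn with .next() for a fixed number of iterations, which during training exceeds one pass over the data, so it presumably wraps a DataLoader and cycles it indefinitely. A minimal sketch under that assumption (the real implementation may differ):

class GlueIterator:
    """Hypothetical sketch: endlessly cycles a DataLoader via .next()."""

    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)

    def next(self):
        try:
            return next(self.iterator)
        except StopIteration:
            # Loader exhausted: restart from the beginning.
            self.iterator = iter(self.dataloader)
            return next(self.iterator)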
Example #2
Training driver: multi-task fine-tuning of a shared DistilBERT encoder, with one classification head, optimizer, and dataset per GLUE task.
def main():

    ntasks = len(tasks)

    data_args = list()
    configuration = list()
    sub_models = list()
    datasets = list()
    sub_optimizer = list()
    metrics = list()
    tokenizer = DistilBertTokenizer.from_pretrained(bert_path,
                                                    cache_dir=cache_dir)

    for i in range(ntasks):
        logger.info("Tasks:" + tasks[i])
        data_args.append(GlueDataArgs(task_name=tasks[i]))
        configuration.append(
            DistilBertConfig.from_pretrained(
                bert_path,
                num_labels=glue_tasks_num_labels[tasks[i].lower()],
                finetuning_task=data_args[i].task_name,
                cache_dir=cache_dir))
        if use_gpu:
            sub_models.append(SequenceClassification(configuration[i]).cuda())
        else:
            sub_models.append(SequenceClassification(configuration[i]))

        datasets.append(
            GlueDataSets(data_args[i],
                         tokenizer=tokenizer,
                         cache_dir=cache_dir))
        sub_optimizer.append(
            torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate_0))
        metrics.append(ComputeMetrics(data_args[i]))
        logger.info("*** DataSet Ready ***")

    if use_gpu:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True).cuda()
    else:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True)

    bert_optimizer = torch.optim.AdamW(Bert_model.parameters(),
                                       lr=learning_rate_0)

    # Balanced schedule: the iteration count is driven by the largest training set.
    train_num = list()
    for i in range(ntasks):
        train_num.append(datasets[i].length("train"))
    logging.info("Training set sizes: {}".format(train_num))
    iterations = (epochs * max(train_num) // bs) + 1

    # Linear learning-rate decay over the full run (the schedulers are defined
    # here, but their step() calls are disabled further down).
    sub_scheduler = list()
    for i in range(ntasks):
        sub_scheduler.append(
            torch.optim.lr_scheduler.LambdaLR(
                sub_optimizer[i], lambda step: (1.0 - step / iterations)))
    Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(
        bert_optimizer, lambda step: (1.0 - step / iterations))

    # One cycling iterator per task so every step can draw a fresh batch.
    train_iter = list()
    for i in range(ntasks):
        train_iter.append(
            GlueIterator(datasets[i].dataloader("train", batch_size_train[i])))

    for i in range(1, iterations + 1):

        # Keep the shared encoder frozen for the first `frozen` iterations,
        # then release it for fine-tuning.
        if i >= frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
            if i == frozen:
                logging.info("#####################################")
                logging.info("Releasing training of the main model.")
                logging.info("#####################################")
        else:
            for p in Bert_model.parameters():
                p.requires_grad = False
            Bert_model.eval()

        losses = list()
        loss_rates = list()

        for j in range(ntasks):
            sub_models[j].train()
            data = train_iter[j].next()

            # DistilBERT takes no token_type_ids, so only ids, mask, and labels are needed.
            if use_gpu:
                input_ids = data['input_ids'].cuda()
                attention_mask = data['attention_mask'].cuda()
                label = data['labels'].cuda()
            else:
                input_ids = data['input_ids']
                attention_mask = data['attention_mask']
                label = data['labels']

            # Shared encoder forward, then the task head returns the loss at index 0.
            output_inter = Bert_model(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      return_dict=True)
            losses.append(sub_models[j](input=output_inter, labels=label)[0])

        # Relative share of each task in the total loss; currently unused,
        # kept for the optional re-weighting commented out below.
        losssum = sum(losses).item()
        for j in range(ntasks):
            loss_rates.append(losses[j].item() / losssum)

        # Batch-size-weighted sum of the per-task losses.
        loss = 0
        printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(
            i, iterations, Bert_scheduler.get_last_lr())
        for j in range(ntasks):
            loss += losses[j] * batch_size_train[j]  # optionally: * loss_rates[j]
            printInfo += ', loss{}-{:.6f}'.format(j, losses[j].item())
            sub_optimizer[j].zero_grad()

        logging.info(printInfo)

        # The shared encoder is only updated once it has been unfrozen.
        if i > frozen:
            bert_optimizer.zero_grad()
        loss.backward()

        if i > frozen:
            bert_optimizer.step()

        for j in range(ntasks):
            sub_optimizer[j].step()
            # sub_scheduler[j].step()

        # Bert_scheduler.step()

        if (i % eval_interval == 0):
            evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics,
                     ntasks)
            save_models(Bert_model, sub_models, ntasks, i)

    evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics, ntasks)
    save_models(Bert_model, sub_models, ntasks, iterations)
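
The SequenceClassification head used in both examples is also not shown. Judging from how it is called (keyword input= taking the encoder output, an optional labels=, a loss at index 0, and a .logits attribute), it plausibly mirrors Hugging Face's DistilBERT classification head. The following is a hedged sketch under that assumption, not the original implementation:

import torch
from transformers.modeling_outputs import SequenceClassifierOutput

class SequenceClassification(torch.nn.Module):
    """Hypothetical reconstruction of the per-task head the examples assume."""

    def __init__(self, config):
        super().__init__()
        self.num_labels = config.num_labels
        self.pre_classifier = torch.nn.Linear(config.dim, config.dim)
        self.classifier = torch.nn.Linear(config.dim, config.num_labels)
        self.dropout = torch.nn.Dropout(config.seq_classif_dropout)

    def forward(self, input, labels=None):
        # Pool the encoder output at the [CLS] position.
        pooled = input.last_hidden_state[:, 0]
        pooled = torch.nn.functional.relu(self.pre_classifier(pooled))
        logits = self.classifier(self.dropout(pooled))
        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # Regression task (STS-B)
                loss = torch.nn.functional.mse_loss(
                    logits.view(-1), labels.float().view(-1))
            else:
                loss = torch.nn.functional.cross_entropy(
                    logits.view(-1, self.num_labels), labels.view(-1))
        # SequenceClassifierOutput supports both output[0] (the loss, when present)
        # and output.logits, matching how the examples consume the result.
        return SequenceClassifierOutput(loss=loss, logits=logits)

With the module-level names the examples assume (tasks, bert_path, cache_dir, use_gpu, bs, epochs, frozen, learning_rate_0, batch_size_train, batch_size_val, eval_interval, plus the evaluate and save_models helpers) defined, main() would be run through the usual if __name__ == "__main__": main() guard.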