Example #1
def train(args, train_dataset, model):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
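    # Effective batch size scales with the number of visible GPUs; DataParallel
    # later splits each batch back across the devices during the forward pass.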
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
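    # Warm up the learning rate over the first 10% of updates, then decay linearly to zero.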
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=t_total * 0.1,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        if args.fp16_opt_level == "O2":
            keep_batchnorm_fp32 = False
        else:
            keep_batchnorm_fp32 = True
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.fp16_opt_level,
            keep_batchnorm_fp32=keep_batchnorm_fp32)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = DDP(
            model,
            message_size=250000000,
            gradient_predivide_factor=torch.distributed.get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs = 0
    model.zero_grad()
    model.train()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Train(XX Epoch) Step(X/X) (loss=X.X)",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device)
                          for t in batch)  # multi-gpu does scattering itself
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            outputs = model(input_ids, segment_ids, input_mask,
                            start_positions, end_positions)
            loss = outputs  # this model returns the loss directly when start/end positions are given

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
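                # Dividing here keeps the accumulated gradient equal to the
                # average over the full effective batch.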

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule (after the optimizer step)
                optimizer.zero_grad()
                global_step += 1
                epoch_iterator.set_description(
                    "Train(%d Epoch) Step(%d / %d) (loss=%5.5f)" %
                    (epochs, global_step, t_total, loss.item()))

        if args.local_rank in [-1, 0]:
            model_checkpoint = 'korquad_{0}_{1}_{2}_{3}.bin'.format(
                args.learning_rate, args.train_batch_size, epochs,
                int(args.num_train_epochs))
            logger.info(model_checkpoint)
            output_model_file = os.path.join(args.output_dir, model_checkpoint)
            if args.n_gpu > 1 or args.local_rank != -1:
                logger.info("** ** * Saving file * ** **(module)")
                torch.save(model.module.state_dict(), output_model_file)
            else:
                logger.info("** ** * Saving file * ** **")
                torch.save(model.state_dict(), output_model_file)
        epochs += 1
    logger.info("Training End!!!")
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=80, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=24, eval_batch_size=8, learning_rate=2e-5, num_train_epochs=15,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):
    


    """Fine-tune and evaluate the consistency-cue classifier.

    Parameters (mirroring the argparse options this function replaced):
        data_dir: Input data dir; should contain the .tsv files (or other data files) for the task.
        bert_model: BERT pre-trained model, e.g. bert-base-uncased, bert-large-uncased,
            bert-base-cased, bert-base-multilingual, bert-base-chinese.
        task_name: The name of the task to train.
        output_dir: The output directory where the model checkpoints will be written.
        max_seq_length: The maximum total input sequence length after WordPiece
            tokenization. Longer sequences are truncated; shorter ones are padded.
        do_train: Whether to run training.
        do_eval: Whether to run eval on the dev set.
        do_lower_case: Set True if you are using an uncased model.
        train_batch_size: Total batch size for training.
        eval_batch_size: Total batch size for eval.
        learning_rate: The initial learning rate for Adam.
        num_train_epochs: Total number of training epochs to perform.
        warmup_proportion: Proportion of training to perform linear learning rate
            warmup for, e.g. 0.1 = 10% of training.
        no_cuda: Whether not to use CUDA when available.
        local_rank: local_rank for distributed training on gpus (-1 = not distributed).
        seed: Random seed for initialization.
        gradient_accumulation_steps: Number of update steps to accumulate before
            performing a backward/update pass.
        optimize_on_cpu: Whether to perform optimization and keep the optimizer
            averages on CPU.
        loss_scale: Loss scaling; positive power-of-2 values can improve fp16 convergence.
        fp16: Whether to use 16-bit float precision instead of 32-bit.
        saved_model: Path to a saved state dict to load for evaluation.
    """
    
    processors = {
        "mrpc": MrpcProcessor,
        "stance": StanceProcessor,
        "neg": NegProcessor,
        "tri": TriProcessor
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_examples = None
    num_train_steps = None
    if do_train:
        
        train_df = processor.get_train_df(data_dir)
        test_df = processor.get_test_df(data_dir)
        dev_df = processor.get_dev_df(data_dir)
        
        new_train_df = generate_opp_pers_dataset(train_df)
        
        new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"),sep='\t',index=False)
        
        new_test_df = generate_opp_pers_dataset_with_naive(test_df)
        
        new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"),sep='\t',index=False)
        
        new_dev_df = generate_opp_pers_dataset_with_naive(dev_df)
        
        new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"),sep='\t',index=False)
        
        train_examples = processor.get_train_examples(data_dir)
        
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

        # Prepare model
        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2)
        model.to(device)
        
        if fp16:
            model.half()

        if local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                              output_device=local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                               for n, param in model.named_parameters()]
        elif optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                               for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        t_total = num_train_steps
        if local_rank != -1:
            t_total = t_total // torch.distributed.get_world_size()
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if do_train:

        claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("claims features done")
        train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("perspective features done")
        opposite_perspective_features = convert_triopp_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("opp perspective features done")

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        
            
        pers_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        
        
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features], dtype=torch.long)
        
        
        
        train_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)

        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()

        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            process_bar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(process_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids = batch
                
                out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids)
                loss = out_results
            
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                process_bar.set_description("Loss: %0.8f" % (loss.sum().item()))
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
            print("\nLoss: {}\n".format(tr_loss / nb_tr_steps))
        model_to_save = model.module if hasattr(model, 'module') else model  # unwrap DataParallel/DDP
        torch.save(model_to_save.state_dict(), os.path.join(output_dir, "with_naive_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15.pth"))


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):

        # Evaluate on the test split (swap in get_train_examples / get_dev_examples for other splits).
        eval_examples = processor.get_test_examples(data_dir)
        claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        eval_features = convert_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        opposite_eval_features = convert_triopp_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer)
    
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        
        pers_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        
        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_eval_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_eval_features], dtype=torch.long)
        
        eval_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)
        

        # Reload the fine-tuned weights for evaluation.
        model_state_dict = torch.load(saved_model)
        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2, state_dict=model_state_dict)
        model.to(device)

        model.eval()

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        distance_eval_tp, distance_eval_pred_c, distance_eval_gold_c = 0, 0, 0
        eval_loss, eval_accuracy, eval_macro_p, eval_macro_r = 0, 0, 0, 0
        distance_accuracy, distance_eval_macro_p, distance_eval_macro_r = 0, 0, 0

        raw_score = []
        predicted_labels = []
        distance_labels = []
        predicted_prob = []
        gold_labels = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids in eval_dataloader:
            
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            
            claim_input_ids = claim_input_ids.to(device)
            claim_input_mask = claim_input_mask.to(device)
            claim_segment_ids = claim_segment_ids.to(device)
            claim_label_ids = claim_label_ids.to(device)
            
            opp_input_ids = opp_input_ids.to(device)
            opp_input_mask = opp_input_mask.to(device)
            opp_segment_ids = opp_segment_ids.to(device)
            opp_label_ids = opp_label_ids.to(device)
            
            with torch.no_grad():
                tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids)

                # A single unlabeled forward pass returns both heads:
                # classification logits and distance logits.
                eval_outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask)
                logits = eval_outputs[0]
                distance_logits = eval_outputs[1]
            
            logits = logits.detach().cpu().numpy()
            distance_logits = distance_logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            tmp_eval_accuracy = accuracy(logits, label_ids)
            distance_eval_accuracy = accuracy(distance_logits, label_ids)
            
            tmp_predicted = np.argmax(logits, axis=1)
            distance_predicted = np.argmax(distance_logits, axis=1)
            
            predicted_labels.extend(tmp_predicted.tolist())
            distance_labels.extend(distance_predicted.tolist())
            gold_labels.extend(label_ids.tolist())
            
            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            
            distance_tp, distance_pred_c, distance_gold_c = tp_pcount_gcount(distance_logits, label_ids)
            distance_eval_tp += distance_tp
            distance_eval_pred_c += distance_pred_c
            distance_eval_gold_c += distance_gold_c
            
            pred_label = np.argmax(logits, axis=1)
            distance_label = np.argmax(distance_logits, axis=1)
            
            raw_score += zip(logits, distance_logits, pred_label, distance_label, label_ids)
            
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r
            
            distance_eval_p, distance_eval_r, distance_eval_f1 = p_r_f1(distance_logits, label_ids)

            distance_eval_macro_p += distance_eval_p
            distance_eval_macro_r += distance_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            distance_accuracy += distance_eval_accuracy
            
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)
        
        distance_eval_micro_p = distance_eval_tp / distance_eval_pred_c
        distance_eval_micro_r = distance_eval_tp / distance_eval_gold_c
        distance_eval_micro_f1 = 2 * distance_eval_micro_p * distance_eval_micro_r / (distance_eval_micro_p + distance_eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)
        
        distance_eval_macro_p = distance_eval_macro_p / nb_eval_steps
        distance_eval_macro_r = distance_eval_macro_r / nb_eval_steps
        distance_eval_macro_f1 = 2 * distance_eval_macro_p * distance_eval_macro_r / (distance_eval_macro_p + distance_eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        distance_accuracy = distance_accuracy / nb_eval_examples
#         print("\nLoss: {}\n".format(eval_loss / nb_eval_steps))
        result = {
                  'eval_loss': eval_loss,
                  'eval_accuracy':eval_accuracy,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  'eval_macro_p': eval_macro_p,
                  'eval_macro_r': eval_macro_r,
                  'eval_macro_f1': eval_macro_f1,
            
                  'distance_accuracy':distance_accuracy,
                  'distance_eval_micro_p': distance_eval_micro_p,
                  'distance_eval_micro_r': distance_eval_micro_r,
                  'distance_eval_micro_f1': distance_eval_micro_f1,
                  'distance_eval_macro_p': distance_eval_macro_p,
                  'distance_eval_macro_r': distance_eval_macro_r,
                  'distance_eval_macro_f1': distance_eval_macro_f1
                  }

        output_eval_file = os.path.join(output_dir,"time_pers_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch25_eval_results.txt")
        output_raw_score = os.path.join(output_dir,"time_pers_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch25_raw_score.csv")
        
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score", "cp_distance", "cop_distance", "predict_label", "distance_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, distance, pred, distance_pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "cp_distance": str(distance[0]),
                    "cop_distance": str(distance[1]),
                    "predict_label": str(pred),
                    "distance_label": str(distance_pred),
                    "gold": str(gold)
                })
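
The evaluation loop above leans on three helpers (accuracy, tp_pcount_gcount, p_r_f1) that are not shown here. A minimal sketch consistent with how they are called, assuming a binary task with class 1 as the positive class:

import numpy as np

def accuracy(logits, labels):
    # Count of correct predictions in the batch (summed, then divided
    # by nb_eval_examples in the caller).
    preds = np.argmax(logits, axis=1)
    return np.sum(preds == labels)

def tp_pcount_gcount(logits, labels, positive=1):
    # True positives, predicted-positive count, and gold-positive count;
    # aggregated across batches for micro P/R/F1.
    preds = np.argmax(logits, axis=1)
    tp = np.sum((preds == positive) & (labels == positive))
    return tp, np.sum(preds == positive), np.sum(labels == positive)

def p_r_f1(logits, labels, positive=1):
    # Batch-level precision/recall/F1, averaged across batches for macro F1.
    tp, pred_c, gold_c = tp_pcount_gcount(logits, labels, positive)
    p = tp / pred_c if pred_c else 0.0
    r = tp / gold_c if gold_c else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f1
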
Example #3
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter("./runs2/distilbert_align/")

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    # Four parameter groups: the freshly initialized layers (classifier /
    # linear_r / linear_g) train at a higher LR (2e-3), the pre-trained
    # backbone at args.learning_rate; each is further split by weight decay.
    new_no_decay_params = []
    new_decay_params = []
    base_no_decay_params = []
    base_decay_params = []
    optimizer_grouped_parameters = []
    for n, p in model.named_parameters():
        if 'classifier' in n or 'linear_r' in n or 'linear_g' in n:
            if any(nd in n for nd in no_decay):
                new_no_decay_params.append(p)
            else:
                new_decay_params.append(p)
        else:
            if any(nd in n for nd in no_decay):
                base_no_decay_params.append(p)
            else:
                base_decay_params.append(p)
    optimizer_grouped_parameters.append({
        "params": new_no_decay_params,
        "weight_decay": 0,
        "lr": 2e-3
    })
    optimizer_grouped_parameters.append({
        "params": new_decay_params,
        "weight_decay": args.weight_decay,
        "lr": 2e-3
    })
    optimizer_grouped_parameters.append({
        "params": base_no_decay_params,
        "weight_decay": 0
    })
    optimizer_grouped_parameters.append({
        "params": base_decay_params,
        "weight_decay": args.weight_decay
    })
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warm_up_steps,
                                     t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr_n',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('lr_o',
                                         scheduler.get_lr()[2], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    tb_writer.close()

    return global_step, tr_loss / global_step
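
Since the optimizer above holds four parameter groups (new-layer/no-decay, new-layer/decay, backbone/no-decay, backbone/decay, in that order), scheduler.get_lr() returns four values; the logging calls pick index 0 for the new layers and index 2 for the backbone. A quick illustrative check of that layout:

for i, group in enumerate(optimizer.param_groups):
    # lr, weight decay, and parameter count per group
    print(i, group.get("lr"), group["weight_decay"], len(group["params"]))
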
Example #4
def provider(
    data_folder,
    df_path,
    phase,
    fold=0,
    mean=None,
    std=None,
    batch_size=8,
    num_workers=4,
):
    '''Returns dataloader for the model training'''
    df = pd.read_csv(df_path)
    # some preprocessing
    # https://www.kaggle.com/amanooo/defect-detection-starter-u-net
    df['ImageId'], df['ClassId'] = zip(*df['ImageId_ClassId'].str.split('_'))
    df['ClassId'] = df['ClassId'].astype(int)
    df = df.pivot(index='ImageId', columns='ClassId', values='EncodedPixels')
    df['defects'] = df.count(axis=1)

    #train_df, val_df = train_test_split(df, test_size=0.1, stratify=df["defects"])
    # TODO: replace this random grouping with a proper stratified K-fold split.
    # NOTE: the group assignment is random and unseeded, so the train/val split
    # differs on every call; seed NumPy first if you need a reproducible fold.
    num_data = df.shape[0]
    df['group'] = np.random.randint(5, size=num_data)
    train_df = df[df['group'] != fold]
    val_df = df[df['group'] == fold]

    # when training, optionally drop rows that have no RLE data
    df = train_df if phase == "train" else val_df
    image_dataset = TrainvalDataset(df, data_folder, mean, std, phase)

    # Optional resampling with per-class weights [2.2, 14, 52, 2.5, 16].
    resample = False  # TODO: set True to enable weighted resampling
    if phase == 'train' and resample:
        print("Defining Sampler......")

        class_weights = torch.Tensor([2.2, 14, 52, 2.5, 16]).cuda()  #to gpu
        #class_weights = torch.sqrt(class_weights)
        print(class_weights)

        size = len(image_dataset)
        sample_targets = torch.zeros(size, dtype=torch.int64).cuda()  # to gpu
        for idx in tqdm(range(size), total=size):
            _, mask = make_mask(idx, df)  #[256,1600,4]
            sum_cls = mask.sum(0).sum(0)
            if (sum_cls.sum() == 0):  # background only
                sample_targets[idx] = 0
            else:  # foreground present
                # class priority 2, 4, 1, 3 (i.e. sum_cls indices 1, 3, 0, 2)
                if sum_cls[1] != 0: sample_targets[idx] = 2
                elif sum_cls[3] != 0: sample_targets[idx] = 4
                elif sum_cls[0] != 0: sample_targets[idx] = 1
                elif sum_cls[2] != 0: sample_targets[idx] = 3
        sample_weights = class_weights[sample_targets]
        assert sample_weights.shape[0] == size
        sampler = WeightedRandomSampler(weights=sample_weights,
                                        num_samples=size)

    elif phase == 'train' and not resample:
        sampler = RandomSampler(image_dataset)
    else:
        # phase != 'train'
        sampler = SequentialSampler(image_dataset)

    dataloader = DataLoader(
        image_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=False,
    )

    return dataloader
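
An illustrative call, assuming the Severstal steel-defect CSV layout the preprocessing expects (ImageId_ClassId / EncodedPixels columns) and that TrainvalDataset yields (image, mask) pairs; paths and normalization constants are placeholders:

train_loader = provider(
    data_folder="./train_images",
    df_path="./train.csv",
    phase="train",
    fold=0,
    mean=(0.485, 0.456, 0.406),   # ImageNet statistics, a common choice
    std=(0.229, 0.224, 0.225),
    batch_size=8,
    num_workers=4,
)
for images, masks in train_loader:
    break  # one batch of images and their per-class defect masks
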
Example #5
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--output",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to processed datasets and export files")
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default="ner",
                        type=str,
                        help="The name of the task to train.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_false',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    args.data_dir = os.path.join(args.output, 'nerdata')
    args.output_dir = os.path.join(args.output, 'nermodel')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # num_train_epochs is a float, so multiply inside int() to get an integer step count
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       cache_dir=cache_dir,
                                                       num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
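    # Following the original BERT recipe: no weight decay on biases or LayerNorm parameters.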
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(
            model_config,
            open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        # pytorch_pretrained_bert's BertConfig accepts a config json path here
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()
            for i, mask in enumerate(input_mask):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(mask):
                    if j == 0:
                        continue
                    if m:
                        if label_map[label_ids[i][j]] != "X":
                            temp_1.append(label_map[label_ids[i][j]])
                            temp_2.append(label_map[logits[i][j]])
                    else:
                        temp_1.pop()
                        temp_2.pop()
                        break
                y_true.append(temp_1)
                y_pred.append(temp_2)
        # seqeval-style classification_report: y_true / y_pred are lists of
        # per-sentence label sequences
        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
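# Note: warmup_linear is not defined in this snippet. It presumably matches
# the classic helper from pytorch_pretrained_bert.optimization, whose schedule
# BertAdam applies internally in the non-fp16 path. A minimal sketch, assuming
# that behavior:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / total_steps):
    # ramp the multiplier linearly up to 1.0 during the warmup fraction,
    # then decay linearly towards zero.
    if x < warmup:
        return x / warmup
    return 1.0 - x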
Example No. 6
def main(config):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = CNN_LSTM_Triplet_Dataset(config)
    print(len(train_dataset))
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset,
                              batch_size=config["model"]["batch_size"],
                              sampler=train_sampler)

    model = CNN_LSTM(config).double().to(device=device)

    # if True:
    #     model.load_state_dict(torch.load("./best_model_eval/model_acc_83.3276.pth"))

    # optimizer = torch.optim.Adam(lr=1e-5, betas=(0.9, 0.98), eps=1e-9)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        # torch.optim.Adam reads 'weight_decay' from each param group; the
        # original 'weight_decay_rate' key would be silently ignored.
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = torch.optim.Adam(lr=1e-2,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                                 params=optimizer_grouped_parameters)
    loss_fn = nn.TripletMarginLoss()

    acc_dev_previous = 0
    loss_dev_previous = 1000
    loss_train = 1000
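
    # NOTE: dev_loader / dev_dataset are used below but never built in this
    # snippet. A hypothetical construction mirroring the training set (how
    # the dev split is actually selected is not shown in the original):
    dev_dataset = CNN_LSTM_Triplet_Dataset(config)  # hypothetical dev split
    dev_loader = DataLoader(dev_dataset,
                            batch_size=config["model"]["batch_size"])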

    for epoch in range(1, config["model"]["epoch"]):
        print(f"Training epoch {str(epoch)}")

        loss_train = train(train_loader, model, optimizer, loss_fn, loss_train,
                           epoch, device)

        print(f"Evaluate model.............")

        acc_dev, loss_dev = evaluate(dev_loader, model, loss_fn, epoch,
                                     len(dev_dataset), device)

        print(f"Accuracy score: {acc_dev:.4f} at epoch {epoch}")
        print(f"Loss Dev: {loss_dev:.4f} at epoch {epoch}")
        print("=" * 15, f"END EPOCH {epoch}", "=" * 15)

        if acc_dev > acc_dev_previous:
            acc_dev_previous = acc_dev
            torch.save(model.state_dict(),
                       f"./best_model_eval/model_acc_{round(acc_dev, 4)}.pth")
        if loss_dev < loss_dev_previous:
            loss_dev_previous = loss_dev
            torch.save(
                model.state_dict(),
                f"./best_model_eval/model_loss_{round(loss_dev, 4)}.pth")
Example No. 7
def probe(args: Namespace, probing_model: BaseDecoder,
          tokenizer: AutoTokenizer, dataset_args, layer: int, vocab):
    write_to_execution_log(100 * '+' +
                           '\t Probing layer: {} \t'.format(layer) + 100 * '+',
                           append_newlines=True,
                           path=args.execution_log)
    print(
        '##################################      Layer: {}      #########################################'
        .format(layer))

    layer_data = {}

    google_re_metrices = []
    trex_metrices = []

    my_collate = functools.partial(probing_model.cloze_collate,
                                   tokenizer=tokenizer)

    print(
        '$$$$$$$$$$$$$$$$$$$$$$$    Probing model of type: {}      $$$$$$$$$$$$$$$$$$$$$$$$'
        .format(args.model_type))
    for ele in dataset_args:
        ds_name, relation_args_list = ele

        layer_data[ds_name] = {}
        layer_data[ds_name]['means'] = []

        print(
            '*****************   {}   **********************'.format(ds_name))

        for relation_args in relation_args_list:
            relation_args = DotMap(relation_args)
            args.relation_args = relation_args
            print('---------------- {} ----------------------'.format(
                args.relation_args.template))
            print(stringify_dotmap(args.relation_args))

            layer_data[ds_name][args.relation_args.relation] = []

            dataset = ClozeDataset(probing_model,
                                   tokenizer,
                                   args,
                                   vocab,
                                   tokenizer.model_max_length,
                                   output_debug_info=False)

            # Create dataloader
            sampler = RandomSampler(dataset)
            dataloader = DataLoader(dataset,
                                    sampler=sampler,
                                    batch_size=args.probing_batch_size,
                                    collate_fn=my_collate)

            metrics_elements = []

            for _, batch in enumerate(tqdm(dataloader)):
                metrics_elements_from_batch = probing_model.probe(
                    batch, layer=layer, relation_args=relation_args)
                metrics_elements.extend(metrics_elements_from_batch)

                gc.collect()

            print('Number metrics elements: {}'.format(len(metrics_elements)))
            aggregated_metrics = aggregate_metrics_elements(metrics_elements)
            print('Aggregated: {}'.format(aggregated_metrics['P_AT_1']))

            if ds_name == 'Google_RE':
                google_re_metrices.append(aggregated_metrics)
            elif ds_name == 'TREx':
                trex_metrices.append(aggregated_metrics)

            layer_data[ds_name][args.relation_args.relation].append(
                aggregated_metrics)
            write_to_execution_log(ds_name + ': ' +
                                   args.relation_args.relation + '\t' +
                                   str(aggregated_metrics['P_AT_1']),
                                   append_newlines=True,
                                   path=args.execution_log)

    # Write results to logfile
    if len(google_re_metrices) > 0:
        write_to_execution_log('\n\nGoogle_RE: ' +
                               mean_precisions(google_re_metrices),
                               append_newlines=True,
                               path=args.execution_log)
        layer_data['Google_RE']['means'].append(
            mean_precisions(google_re_metrices))
    if len(trex_metrices) > 0:
        write_to_execution_log('Trex: ' + mean_precisions(trex_metrices),
                               append_newlines=True,
                               path=args.execution_log)
        layer_data['TREx']['means'].append(mean_precisions(trex_metrices))
    write_to_execution_log(220 * '-',
                           append_newlines=True,
                           path=args.execution_log)

    if args.use_wandb_logging:
        wandb.init(name=args.wandb_run_name,
                   project=args.wandb_project_name,
                   settings=wandb.Settings(start_method='thread'))
        wandb_log_metrics(layer_data, layer)

    return layer_data
Example No. 8
def run(datasets, seed=42):
    ## META VARIABLES
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    epochs = 20
    batch_size = 32

    X_train, mask_train, y_train = load_data(datasets['train'], num_sample=int(625*0.7))
    X_val, mask_val, y_val = load_data(datasets['val'], num_sample=int(625*0.1))
    X_test, mask_test, y_test = load_data(datasets['test'], num_sample=int(625*0.2))


    train_dataset = TensorDataset(X_train, mask_train, y_train)
    val_dataset = TensorDataset(X_val, mask_val, y_val)
    test_dataset = TensorDataset(X_test, mask_test, y_test)
    
    # Dataloading
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=RandomSampler(val_dataset), batch_size=batch_size)
    prediction_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=prediction_sampler, batch_size=batch_size)

    model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels = 3).to(device)
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

    total_t0 = time.time()
    best_state_dict = None
    best_val = 0
    for epoch_i in range(epochs):
    
        # ========================================
        #               Training
        # ========================================
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()
        total_train_loss = 0

        model.train()

        predictions_train = np.array([])
        true_label_train = np.array([])
        for step, batch in enumerate(train_dataloader):

            # Progress update every 50 batches.
            if step % 50 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()        
            loss, logits = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask, 
                                labels=b_labels)

            total_train_loss += loss.item() * b_labels.shape[0]
            loss.backward()
            optimizer.step()
            scheduler.step()

            gc.collect()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions_train = np.append(predictions_train, np.argmax(logits, axis=1).flatten())
            true_label_train = np.append(true_label_train, label_ids)
            
       
        # Calculate the average loss over all of the batches.
        accuracy_train = np.sum(predictions_train == true_label_train) / true_label_train.shape[0]
        f1_macro_train = f1_score(true_label_train, predictions_train, average='macro')
        f1_micro_train = f1_score(true_label_train, predictions_train, average='micro')
        print("\n  Training Accuracy: {0:.2f}".format(accuracy_train))
        print("  Training F1-MACRO: {0:.2f}".format(f1_macro_train))
        print("  Training F1-MICRO: {0:.2f}".format(f1_micro_train))

        avg_train_loss = total_train_loss / true_label_train.shape[0]        
        training_time = format_time(time.time() - t0)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))
            
        # ========================================
        #               Validation
        # ========================================

        print("\nRunning Validation...")

        t0 = time.time()
        model.eval()

        total_val_loss = 0
        predictions_val = np.array([])
        true_label_val = np.array([])
        for batch in validation_dataloader:            
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            with torch.no_grad():        
                (loss, logits) = model(b_input_ids, 
                                    token_type_ids=None, 
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
                
            total_val_loss += loss.item() * b_labels.shape[0]

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions_val = np.append(predictions_val, np.argmax(logits, axis=1).flatten())
            true_label_val = np.append(true_label_val, label_ids)

        accuracy_val = np.sum(predictions_val == true_label_val) / true_label_val.shape[0]
        f1_macro_val = f1_score(true_label_val, predictions_val, average='macro')
        f1_micro_val = f1_score(true_label_val, predictions_val, average='micro')
        print("  Accuracy: {0:.2f}".format(accuracy_val))
        print("  F1-MACRO: {0:.2f}".format(f1_macro_val))
        print("  F1-MICRO: {0:.2f}".format(f1_micro_val))

        performance_metric = f1_macro_val

        if performance_metric > best_val:
            print("Best Model Updated.")
            best_val = performance_metric
            # Deep-copy the weights so later optimizer steps don't mutate the
            # saved snapshot in place (requires the stdlib "copy" module).
            best_state_dict = copy.deepcopy(model.state_dict())

        avg_val_loss = total_val_loss / true_label_val.shape[0]
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

    print("\nTraining complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    # ========================================
    #               Test
    # ========================================

    model.load_state_dict(best_state_dict)
    model.eval()

    predictions_test = np.array([])
    true_label_test = np.array([])
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions_test = np.append(predictions_test, np.argmax(logits, axis=1).flatten())
        true_label_test = np.append(true_label_test, label_ids)
    
    best_accr = np.sum(predictions_test == true_label_test) / true_label_test.shape[0]
    best_macro_f1 = f1_score(true_label_test, predictions_test, average='macro')
    best_micro_f1 = f1_score(true_label_test, predictions_test, average='micro')
    best_confusion_matrix = confusion_matrix(true_label_test, predictions_test)
    print("  Test Accuracy: {0:.2f}".format(best_accr))
    print("  Test F1-MACRO: {0:.2f}".format(best_macro_f1))
    print("  Test F1-MICRO: {0:.2f}".format(best_micro_f1))



    # ========================================
    #              Dummy Test
    # ========================================

    X_train = X_train.detach().cpu().numpy()
    X_test = X_test.detach().cpu().numpy()
    y_train = y_train.detach().cpu().numpy().squeeze(1)
    y_test = y_test.detach().cpu().numpy().squeeze(1)
    dummy_clf = DummyClassifier(strategy="uniform")
    dummy_clf.fit(X_train, y_train)
    predictions_dummy = dummy_clf.predict(X_test)

    dummy_accr = np.sum(predictions_dummy == y_test) / y_test.shape[0]
    dummy_macro_f1 = f1_score(y_test, predictions_dummy, average='macro')
    dummy_micro_f1 = f1_score(y_test, predictions_dummy, average='micro')
    print("  Dummy Accuracy: {0:.2f}".format(dummy_accr))
    print("  Dummy F1-MACRO: {0:.2f}".format(dummy_macro_f1))
    print("  Dummy F1-MICRO: {0:.2f}".format(dummy_micro_f1))
    
    # torch.save(best_state_dict, './output/best_model_kaushik_sample.pt')

    return {
        'seed': seed,
        'best_accr': best_accr,
        'best_macro_f1': best_macro_f1,
        'best_micro_f1': best_micro_f1,
        'dummy_accr': dummy_accr,
        'dummy_macro_f1': dummy_macro_f1,
        'dummy_micro_f1': dummy_micro_f1
    }
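# Hypothetical driver for run(); the dataset paths below are placeholders,
# not from the original. Averaging the returned metrics dict over a few
# seeds is one natural use:
if __name__ == '__main__':
    datasets = {'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'}
    all_results = [run(datasets, seed=s) for s in (42, 43, 44)]
    print(all_results)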
Example No. 9
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples):
        # The model itself is used inside collate (in eval mode) to compute
        # word-alignment guides for each batch.
        model.eval()
        examples_src, examples_tgt, examples_srctgt, examples_tgtsrc, langid_srctgt, langid_tgtsrc, psi_examples_srctgt, psi_labels = [], [], [], [], [], [], [], []
        src_len = tgt_len = 0
        bpe2word_map_src, bpe2word_map_tgt = [], []
        for example in examples:
            end_id = example[0][0][-1].view(-1)

            src_id = example[0][0][:args.block_size]
            src_id = torch.cat([src_id[:-1], end_id])
            tgt_id = example[1][0][:args.block_size]
            tgt_id = torch.cat([tgt_id[:-1], end_id])

            half_block_size = int(args.block_size / 2)
            half_src_id = example[0][0][:half_block_size]
            half_src_id = torch.cat([half_src_id[:-1], end_id])
            half_tgt_id = example[1][0][:half_block_size]
            half_tgt_id = torch.cat([half_tgt_id[:-1], end_id])

            examples_src.append(src_id)
            examples_tgt.append(tgt_id)
            src_len = max(src_len, len(src_id))
            tgt_len = max(tgt_len, len(tgt_id))

            srctgt = torch.cat([half_src_id, half_tgt_id])
            langid = torch.cat([
                torch.ones_like(half_src_id),
                torch.ones_like(half_tgt_id) * 2
            ])
            examples_srctgt.append(srctgt)
            langid_srctgt.append(langid)

            tgtsrc = torch.cat([half_tgt_id, half_src_id])
            langid = torch.cat([
                torch.ones_like(half_tgt_id),
                torch.ones_like(half_src_id) * 2
            ])
            examples_tgtsrc.append(tgtsrc)
            langid_tgtsrc.append(langid)

            # [neg, neg] pair
            neg_half_src_id = example[-2][0][:half_block_size]
            neg_half_src_id = torch.cat([neg_half_src_id[:-1], end_id])
            neg_half_tgt_id = example[-1][0][:half_block_size]
            neg_half_tgt_id = torch.cat([neg_half_tgt_id[:-1], end_id])
            if random.random() > 0.5:
                neg_srctgt = torch.cat([neg_half_src_id, neg_half_tgt_id])
            else:
                neg_srctgt = torch.cat([neg_half_tgt_id, neg_half_src_id])
            psi_examples_srctgt.append(neg_srctgt)
            psi_labels.append(1)

            # [pos, neg] pair
            rd = random.random()
            if rd > 0.75:
                neg_srctgt = torch.cat([half_src_id, neg_half_tgt_id])
            elif rd > 0.5:
                neg_srctgt = torch.cat([neg_half_src_id, half_tgt_id])
            elif rd > 0.25:
                neg_srctgt = torch.cat([half_tgt_id, neg_half_src_id])
            else:
                neg_srctgt = torch.cat([neg_half_tgt_id, half_src_id])
            psi_examples_srctgt.append(neg_srctgt)
            psi_labels.append(0)

            bpe2word_map_src.append(example[2])
            bpe2word_map_tgt.append(example[3])

        examples_src = pad_sequence(examples_src,
                                    batch_first=True,
                                    padding_value=tokenizer.pad_token_id)
        examples_tgt = pad_sequence(examples_tgt,
                                    batch_first=True,
                                    padding_value=tokenizer.pad_token_id)
        examples_srctgt = pad_sequence(examples_srctgt,
                                       batch_first=True,
                                       padding_value=tokenizer.pad_token_id)
        langid_srctgt = pad_sequence(langid_srctgt,
                                     batch_first=True,
                                     padding_value=tokenizer.pad_token_id)
        examples_tgtsrc = pad_sequence(examples_tgtsrc,
                                       batch_first=True,
                                       padding_value=tokenizer.pad_token_id)
        langid_tgtsrc = pad_sequence(langid_tgtsrc,
                                     batch_first=True,
                                     padding_value=tokenizer.pad_token_id)
        psi_examples_srctgt = pad_sequence(
            psi_examples_srctgt,
            batch_first=True,
            padding_value=tokenizer.pad_token_id)
        psi_labels = torch.tensor(psi_labels)
        guides = model.get_aligned_word(
            examples_src,
            examples_tgt,
            bpe2word_map_src,
            bpe2word_map_tgt,
            args.device,
            src_len,
            tgt_len,
            align_layer=args.align_layer,
            extraction=args.extraction,
            softmax_threshold=args.softmax_threshold)
        return examples_src, examples_tgt, guides, examples_srctgt, langid_srctgt, examples_tgtsrc, langid_tgtsrc, psi_examples_srctgt, psi_labels

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs
    if args.max_steps > 0 and args.max_steps < t_total:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    # Check if continuing training from a checkpoint
    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility

    def backward_loss(loss, tot_loss):
        if args.n_gpu > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        tot_loss += loss.item()
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        return tot_loss

    tqdm_iterator = trange(int(t_total),
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0])
    for _ in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):

            model.train()

            if args.train_so or args.train_co:
                inputs_src, inputs_tgt = batch[0].clone(), batch[1].clone()
                inputs_src, inputs_tgt = inputs_src.to(
                    args.device), inputs_tgt.to(args.device)
                attention_mask_src, attention_mask_tgt = (inputs_src !=
                                                          0), (inputs_tgt != 0)
                guide = batch[2].to(args.device)
                loss = model(inputs_src=inputs_src,
                             inputs_tgt=inputs_tgt,
                             attention_mask_src=attention_mask_src,
                             attention_mask_tgt=attention_mask_tgt,
                             guide=guide,
                             align_layer=args.align_layer,
                             extraction=args.extraction,
                             softmax_threshold=args.softmax_threshold,
                             train_so=args.train_so,
                             train_co=args.train_co)
                tr_loss = backward_loss(loss, tr_loss)

            if args.train_mlm:
                inputs_src, labels_src = mask_tokens(batch[0], tokenizer, args)
                inputs_tgt, labels_tgt = mask_tokens(batch[1], tokenizer, args)
                inputs_src, inputs_tgt = inputs_src.to(
                    args.device), inputs_tgt.to(args.device)
                labels_src, labels_tgt = labels_src.to(
                    args.device), labels_tgt.to(args.device)
                loss = model(inputs_src=inputs_src, labels_src=labels_src)
                tr_loss = backward_loss(loss, tr_loss)

                loss = model(inputs_src=inputs_tgt, labels_src=labels_tgt)
                tr_loss = backward_loss(loss, tr_loss)

            if args.train_tlm:
                rand_ids = [0, 1]
                if not args.train_tlm_full:
                    rand_ids = [int(random.random() > 0.5)]
                for rand_id in rand_ids:
                    select_srctgt = batch[int(3 + rand_id * 2)]
                    select_langid = batch[int(4 + rand_id * 2)]
                    for lang_id in [1, 2]:
                        inputs_srctgt, labels_srctgt = mask_tokens(
                            select_srctgt, tokenizer, args, select_langid,
                            lang_id)
                        inputs_srctgt, labels_srctgt = inputs_srctgt.to(
                            args.device), labels_srctgt.to(args.device)
                        loss = model(inputs_src=inputs_srctgt,
                                     labels_src=labels_srctgt)
                        tr_loss = backward_loss(loss, tr_loss)

            if args.train_psi:
                loss = model(inputs_src=batch[7].to(args.device),
                             labels_psi=batch[8].to(args.device),
                             align_layer=args.align_layer + 1)
                tr_loss = backward_loss(loss, tr_loss)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                tqdm_iterator.update()

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    logger.info(
                        "  Step %s. Training loss = %s", str(global_step),
                        str((tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if global_step > t_total:
                break
        if global_step > t_total:
            break

    return global_step, tr_loss / global_step
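# mask_tokens is not shown in this snippet; its assumed behavior is standard
# BERT-style MLM masking (the original also takes language-id arguments for
# the TLM branches above). A minimal sketch of the plain MLM case:
import torch


def simple_mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    labels = inputs.clone()
    # Sample positions to mask, never touching special tokens
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = torch.tensor(
        [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
         for val in labels.tolist()], dtype=torch.bool)
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked positions
    # 80% of masked positions become [MASK]
    indices_replaced = (torch.bernoulli(torch.full(labels.shape, 0.8)).bool()
                        & masked_indices)
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)
    # 10% become a random token; the remaining 10% keep the original token
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels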
Example No. 10
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_name",
                        default='GBert-predict',
                        type=str,
                        required=False,
                        help="model name")
    parser.add_argument("--data_dir",
                        default='./data/data_v1',
                        type=str,
                        required=False,
                        help="The input data dir.")
    parser.add_argument("--pretrain_dir",
                        default='./saved/GBert-pretraining',
                        type=str,
                        required=False,
                        help="pretraining model")
    parser.add_argument("--train_file",
                        default='data-multi-visit.pkl',
                        type=str,
                        required=False,
                        help="training data file.")
    parser.add_argument(
        "--output_dir",
        default='./saved/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )

    # Other parameters
    parser.add_argument("--use_pretrain",
                        default=True,
                        action='store_true',
                        help="is use pretrain")
    parser.add_argument("--graph",
                        default=True,
                        action='store_true',
                        help="if use ontology embedding")
    parser.add_argument("--therhold",
                        default=0.25,
                        type=float,
                        help="therhold.")
    parser.add_argument(
        "--max_seq_length",
        default=55,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run on the dev set.")
    # parser.add_argument("--do_test",
    #                     default=True,
    #                     action='store_true',
    #                     help="Whether to run on the test set.")
    parser.add_argument("--train_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=50.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=1203,
                        help="random seed for initialization")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")

    args = parser.parse_args()
    args.output_dir = os.path.join(args.output_dir, args.model_name)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # NOTE: hardcoded to the second GPU ("cuda:1") when CUDA is available
    device = torch.device(
        "cuda:1" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError(
    #         "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    print("Loading Dataset")
    tokenizer, (train_dataset, eval_dataset) = load_dataset(args)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=1)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=SequentialSampler(eval_dataset),
                                 batch_size=1)

    print('Loading Model: ' + args.model_name)
    # config = BertConfig(vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx), side_len=train_dataset.side_len)
    # config.graph = args.graph
    # model = SeperateBertTransModel(config, tokenizer.dx_voc, tokenizer.rx_voc)
    if args.use_pretrain:
        logger.info("Use Pretraining model")
        model = GBERT_Predict.from_pretrained(args.pretrain_dir,
                                              tokenizer=tokenizer)
    else:
        config = BertConfig(
            vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx))
        config.graph = args.graph
        model = GBERT_Predict(config, tokenizer)
    logger.info('# of model parameters: ' + str(get_n_params(model)))

    model.to(device)

    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself
    rx_output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")

    # Prepare optimizer
    # num_train_optimization_steps = int(
    #     len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    # param_optimizer = list(model.named_parameters())
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(
    #         nd in n for nd in no_decay)], 'weight_decay': 0.0}
    # ]

    # optimizer = BertAdam(optimizer_grouped_parameters,
    #                      lr=args.learning_rate,
    #                      warmup=args.warmup_proportion,
    #                      t_total=num_train_optimization_steps)
    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        writer = SummaryWriter(args.output_dir)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", 1)

        dx_acc_best, rx_acc_best = 0, 0
        acc_name = 'prauc'
        dx_history = {'prauc': []}
        rx_history = {'prauc': []}

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            print('')
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            prog_iter = tqdm(train_dataloader, leave=False, desc='Training')
            model.train()
            for _, batch in enumerate(prog_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, dx_labels, rx_labels = batch
                input_ids, dx_labels, rx_labels = input_ids.squeeze(
                    dim=0), dx_labels.squeeze(dim=0), rx_labels.squeeze(dim=0)
                loss, rx_logits = model(input_ids,
                                        dx_labels=dx_labels,
                                        rx_labels=rx_labels,
                                        epoch=global_step)
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += 1
                nb_tr_steps += 1

                # Display loss
                prog_iter.set_postfix(loss='%.4f' % (tr_loss / nb_tr_steps))

                optimizer.step()
                optimizer.zero_grad()

            writer.add_scalar('train/loss', tr_loss / nb_tr_steps, global_step)
            global_step += 1

            if args.do_eval:
                print('')
                logger.info("***** Running eval *****")
                model.eval()
                dx_y_preds = []
                dx_y_trues = []
                rx_y_preds = []
                rx_y_trues = []
                for eval_input in tqdm(eval_dataloader, desc="Evaluating"):
                    eval_input = tuple(t.to(device) for t in eval_input)
                    input_ids, dx_labels, rx_labels = eval_input
                    input_ids, dx_labels, rx_labels = input_ids.squeeze(
                        dim=0), dx_labels.squeeze(dim=0), rx_labels.squeeze(
                            dim=0)
                    with torch.no_grad():
                        loss, rx_logits = model(input_ids,
                                                dx_labels=dx_labels,
                                                rx_labels=rx_labels)
                        rx_y_preds.append(t2n(torch.sigmoid(rx_logits)))
                        # The original appended dx_labels here; rx predictions
                        # should be scored against rx labels.
                        rx_y_trues.append(t2n(rx_labels))
                        # dx_y_preds.append(t2n(torch.sigmoid(dx_logits)))
                        # dx_y_trues.append(
                        #     t2n(dx_labels.view(-1, len(tokenizer.dx_voc.word2idx))))
                        # rx_y_preds.append(t2n(torch.sigmoid(rx_logits))[
                        #                   :, tokenizer.rx_singe2multi])
                        # rx_y_trues.append(
                        #     t2n(rx_labels)[:, tokenizer.rx_singe2multi])

                print('')
                # dx_acc_container = metric_report(np.concatenate(dx_y_preds, axis=0), np.concatenate(dx_y_trues, axis=0),
                #                                  args.therhold)
                rx_acc_container = metric_report(
                    np.concatenate(rx_y_preds, axis=0),
                    np.concatenate(rx_y_trues, axis=0), args.therhold)
                for k, v in rx_acc_container.items():
                    writer.add_scalar('eval/{}'.format(k), v, global_step)

                if rx_acc_container[acc_name] > rx_acc_best:
                    rx_acc_best = rx_acc_container[acc_name]
                    # save model
                    torch.save(model_to_save.state_dict(),
                               rx_output_model_file)

                # Log metrics only when eval ran this epoch (rx_acc_container
                # is defined in the do_eval branch above).
                with open(os.path.join(args.output_dir, 'metrics_log.txt'),
                          'a') as f:
                    f.write(
                        "epoch{}, jaccard:{}, f1:{}, prauc:{}, auc:{}\n".format(
                            epoch, rx_acc_container['jaccard'],
                            rx_acc_container['f1'], rx_acc_container['prauc'],
                            rx_acc_container['auc']))

        with open(os.path.join(args.output_dir, 'bert_config.json'),
                  'w',
                  encoding='utf-8') as fout:
            fout.write(model.config.to_json_string())
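# t2n is not defined in this snippet; presumably it just converts a tensor
# to a NumPy array, along these lines:
def t2n(x):
    return x.detach().cpu().numpy()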
Example No. 11
def main(config_file='config/bert_config.json'):
    """Main method for training.

    Args:
        config_file: in config dir
    """
    # 0. Load config and mkdir
    with open(config_file) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    get_path(os.path.join(config.model_path, config.experiment_name))
    get_path(config.log_path)
    # if config.model_type == 'rnn':  # build vocab for rnn
    #     build_vocab(file_in=config.all_train_file_path,
    #                 file_out=os.path.join(config.model_path, 'vocab.txt'))
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    datasets = data.load_train_and_valid_files(
        train_file=config.train_file_path, valid_file=config.valid_file_path)
    train_set, valid_set_train, valid_set_valid, train_label, valid_label = datasets
    if torch.cuda.is_available():
        device = torch.device('cuda')
        # device = torch.device('cpu')
        # torch.distributed.init_process_group(backend="nccl")
        # sampler_train = DistributedSampler(train_set)
        sampler_train = RandomSampler(train_set)
    else:
        device = torch.device('cpu')
        sampler_train = RandomSampler(train_set)
    data_loader = {
        'train':
        DataLoader(train_set,
                   sampler=sampler_train,
                   batch_size=config.batch_size),
        'valid_train':
        DataLoader(valid_set_train,
                   batch_size=config.batch_size,
                   shuffle=False),
        'valid_valid':
        DataLoader(valid_set_valid,
                   batch_size=config.batch_size,
                   shuffle=False),
        "train_label":
        train_label,
        "valid_label":
        valid_label
    }
    # 2. Build model
    model = MODEL_MAP[config.model_type](config)
    # Load model state if a trained checkpoint is given
    if config.trained_weight:
        model.load_state_dict(torch.load(config.trained_weight))
    model.to(device)
    # Distributed data parallelism is disabled here; to enable it, wrap the
    # model after moving it to the device:
    # model = torch.nn.parallel.DistributedDataParallel(
    #     model, find_unused_parameters=True)
    # 3. Train
    trainer = Trainer(model=model,
                      data_loader=data_loader,
                      device=device,
                      config=config)
    best_model_state_dict = trainer.train()
    # 4. Save model
    torch.save(best_model_state_dict,
               os.path.join(config.model_path, 'model.bin'))
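# MODEL_MAP above is not shown; presumably it is a registry mapping the
# config's model_type string to a model class, along the lines of
# (class names hypothetical):
# MODEL_MAP = {'bert': BertClassifier, 'rnn': RnnClassifier}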
Example No. 12
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )

    parser.add_argument("--dev_src_file",
                        default=None,
                        type=str,
                        help="The input data file name.")
    parser.add_argument("--dev_tgt_file",
                        default=None,
                        type=str,
                        help="The output data file name.")
    parser.add_argument("--dev_check_file",
                        default=None,
                        type=str,
                        help="The output style response/know data file name.")
    parser.add_argument("--dev_style_file",
                        default=None,
                        type=str,
                        help="The output style response/know data file name.")

    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Bert config file path.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--log_dir",
        default='',
        type=str,
        required=True,
        help="The output directory where the log will be written.")
    parser.add_argument("--model_recover_path",
                        default=None,
                        type=str,
                        required=True,
                        help="The file of fine-tuned pretraining model.")
    parser.add_argument("--optim_recover_path",
                        default=None,
                        type=str,
                        help="The file of pretraining optimizer.")

    parser.add_argument("--predict_bleu",
                        default=0.5,
                        type=float,
                        help="The Predicted Bleu for KS Predict ")

    parser.add_argument("--train_vae",
                        action='store_true',
                        help="Whether to train vae.")
    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run ks predict.")

    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--label_smoothing",
                        default=0,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="The weight decay rate for Adam.")
    parser.add_argument("--finetune_decay",
                        action='store_true',
                        help="Weight decay to the original weights.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion_step",
        default=300,
        type=int,
        help="Number of steps of linear learning rate warmup to perform.")
    parser.add_argument("--hidden_dropout_prob",
                        default=0.1,
                        type=float,
                        help="Dropout rate for hidden states.")
    parser.add_argument("--attention_probs_dropout_prob",
                        default=0.1,
                        type=float,
                        help="Dropout rate for attention probabilities.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp32_embedding',
        action='store_true',
        help=
        "Whether to use 32-bit float precision instead of 16-bit for embeddings"
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--amp',
                        action='store_true',
                        help="Whether to use amp for fp16")
    parser.add_argument(
        '--from_scratch',
        action='store_true',
        help=
        "Initialize parameters with random values (i.e., training from scratch)."
    )
    parser.add_argument('--new_segment_ids',
                        action='store_true',
                        help="Use new segment ids for bi-uni-directional LM.")
    parser.add_argument('--new_pos_ids',
                        action='store_true',
                        help="Use new position ids for LMs.")
    parser.add_argument('--tokenized_input',
                        action='store_true',
                        help="Whether the input is tokenized.")
    parser.add_argument('--max_len_a',
                        type=int,
                        default=0,
                        help="Truncate_config: maximum length of segment A.")
    parser.add_argument('--max_len_b',
                        type=int,
                        default=0,
                        help="Truncate_config: maximum length of segment B.")
    parser.add_argument(
        '--trunc_seg',
        default='',
        help="Truncate_config: first truncate segment A/B (option: a, b).")
    parser.add_argument(
        '--always_truncate_tail',
        action='store_true',
        help="Truncate_config: Whether we should always truncate tail.")
    parser.add_argument(
        "--mask_prob",
        default=0.15,
        type=float,
        help=
        "Probability of masking a token; the number of predictions can be "
        "less than max_pred when the sequence is short.")
    parser.add_argument(
        "--mask_prob_eos",
        default=0,
        type=float,
        help=
        "Masking probability for the EOS token; the number of predictions "
        "can be less than max_pred when the sequence is short.")
    parser.add_argument('--max_pred',
                        type=int,
                        default=20,
                        help="Max tokens of prediction.")
    parser.add_argument("--num_workers",
                        default=0,
                        type=int,
                        help="Number of workers for the data loader.")

    parser.add_argument('--mask_source_words',
                        action='store_true',
                        help="Whether to mask source words for training")
    parser.add_argument('--skipgram_prb',
                        type=float,
                        default=0.0,
                        help='Probability of n-gram masking.')
    parser.add_argument('--skipgram_size',
                        type=int,
                        default=1,
                        help='Maximum size of the n-gram mask.')
    parser.add_argument('--mask_whole_word',
                        action='store_true',
                        help="Whether masking a whole word.")
    parser.add_argument('--do_l2r_training',
                        action='store_true',
                        help="Whether to do left to right training")
    parser.add_argument(
        '--has_sentence_oracle',
        action='store_true',
        help="Whether to have sentence level oracle for training. "
        "Only useful for summary generation")
    parser.add_argument('--max_position_embeddings',
                        type=int,
                        default=None,
                        help="max position embeddings")
    parser.add_argument('--relax_projection',
                        action='store_true',
                        help="Use different projection layers for tasks.")
    parser.add_argument('--ffn_type',
                        default=0,
                        type=int,
                        help="0: default mlp; 1: W((Wx+b) elem_prod x);")
    parser.add_argument('--num_qkv',
                        default=0,
                        type=int,
                        help="Number of different <Q,K,V>.")
    parser.add_argument('--seg_emb',
                        action='store_true',
                        help="Using segment embedding for self-attention.")
    parser.add_argument(
        '--s2s_special_token',
        action='store_true',
        help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.")
    parser.add_argument('--s2s_add_segment',
                        action='store_true',
                        help="Additional segmental for the encoder of S2S.")
    parser.add_argument(
        '--s2s_share_segment',
        action='store_true',
        help=
        "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)."
    )
    parser.add_argument('--pos_shift',
                        action='store_true',
                        help="Using position shift for fine-tuning.")

    args = parser.parse_args()

    assert Path(
        args.model_recover_path).exists(), "--model_recover_path doesn't exist"

    args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]',
                                              os.getenv('PT_OUTPUT_DIR', ''))
    args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]',
                                        os.getenv('PT_OUTPUT_DIR', ''))

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.log_dir, exist_ok=True)

    handler = logging.FileHandler(os.path.join(args.log_dir, "train.log"),
                                  encoding='UTF-8')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)

    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)

    logger.addHandler(handler)
    logger.addHandler(console)

    json.dump(args.__dict__,
              open(os.path.join(args.output_dir, 'opt.json'), 'w'),
              sort_keys=True,
              indent=2)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)
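    # Worked example (added): with --train_batch_size 32 and
    # --gradient_accumulation_steps 4, each forward/backward pass now sees
    # 32 // 4 = 8 examples and gradients are accumulated over 4 passes,
    # so the effective optimization batch size is still 32.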

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will download model & vocab
        dist.barrier()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    if args.max_position_embeddings:
        tokenizer.max_len = args.max_position_embeddings
    data_tokenizer = WhitespaceTokenizer(
    ) if args.tokenized_input else tokenizer
    if args.local_rank == 0:
        dist.barrier()

    C_bi_uni_pipeline = [
        seq2seq_loader.C_Preprocess4Seq2seq(
            args.max_pred,
            args.mask_prob,
            list(tokenizer.vocab.keys()),
            tokenizer.convert_tokens_to_ids,
            args.max_seq_length,
            new_segment_ids=args.new_segment_ids,
            truncate_config={
                'max_len_a': args.max_len_a,
                'max_len_b': args.max_len_b,
                'trunc_seg': args.trunc_seg,
                'always_truncate_tail': args.always_truncate_tail
            },
            mask_source_words=args.mask_source_words,
            skipgram_prb=args.skipgram_prb,
            skipgram_size=args.skipgram_size,
            mask_whole_word=args.mask_whole_word,
            mode="s2s",
            has_oracle=args.has_sentence_oracle,
            num_qkv=args.num_qkv,
            s2s_special_token=args.s2s_special_token,
            s2s_add_segment=args.s2s_add_segment,
            s2s_share_segment=args.s2s_share_segment,
            pos_shift=args.pos_shift)
    ]

    logger.info("Loading Dataset from {}".format(args.data_dir))

    fn_src = os.path.join(args.data_dir, args.dev_src_file)
    fn_tgt = os.path.join(args.data_dir, args.dev_tgt_file)
    dev_reddit_dataset = seq2seq_loader.C_Seq2SeqDataset(
        fn_src,
        fn_tgt,
        args.eval_batch_size,
        data_tokenizer,
        args.max_seq_length,
        file_oracle=None,
        bi_uni_pipeline=C_bi_uni_pipeline)
    if args.local_rank == -1:
        dev_reddit_sampler = RandomSampler(dev_reddit_dataset,
                                           replacement=False)
        _batch_size = args.eval_batch_size
    else:
        dev_reddit_sampler = DistributedSampler(dev_reddit_dataset)
        _batch_size = args.eval_batch_size // dist.get_world_size()
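    # Added note: with DistributedSampler each rank draws a disjoint shard,
    # so the per-rank batch size is the global eval batch size divided by
    # the world size, e.g. 64 // 4 ranks = 16 examples per GPU.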
    dev_reddit_dataloader = torch.utils.data.DataLoader(
        dev_reddit_dataset,
        batch_size=_batch_size,
        sampler=dev_reddit_sampler,
        num_workers=args.num_workers,
        collate_fn=seq2seq_loader.batch_list_to_batch_tensors,
        pin_memory=False)

    # note: args.train_batch_size has already been divided by args.gradient_accumulation_steps

    amp_handle = None
    if args.fp16 and args.amp:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)
        logger.info("enable fp16 with amp")

    # Prepare model
    recover_step = _get_max_epoch_model(args.output_dir)
    cls_num_labels = 2
    type_vocab_size = (6 + (1 if args.s2s_add_segment else 0)
                       if args.new_segment_ids else 2)
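    # Added note: i.e. type_vocab_size is 7 with --new_segment_ids and
    # --s2s_add_segment, 6 with --new_segment_ids alone, and 2 otherwise.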
    num_sentlvl_labels = 2 if args.has_sentence_oracle else 0
    relax_projection = 4 if args.relax_projection else 0
    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will download model & vocab
        dist.barrier()

    if args.model_recover_path:
        logger.info("***** Recover model: %s *****", args.model_recover_path)
        model_recover = torch.load(args.model_recover_path, map_location='cpu')

    model = BertForPreTrainingLossMask.from_pretrained(
        args.bert_model,
        state_dict=model_recover,
        num_labels=cls_num_labels,
        num_rel=0,
        type_vocab_size=type_vocab_size,
        config_path=args.config_path,
        task_idx=3,
        num_sentlvl_labels=num_sentlvl_labels,
        max_position_embeddings=args.max_position_embeddings,
        label_smoothing=args.label_smoothing,
        fp32_embedding=args.fp32_embedding,
        relax_projection=relax_projection,
        new_pos_ids=args.new_pos_ids,
        ffn_type=args.ffn_type,
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        num_qkv=args.num_qkv,
        seg_emb=args.seg_emb)

    if args.local_rank == 0:
        dist.barrier()

    if args.fp16:
        model.half()
        if args.fp32_embedding:
            model.bert.embeddings.word_embeddings.float()
            model.bert.embeddings.position_embeddings.float()
            model.bert.embeddings.token_type_embeddings.float()
    model.to(device)
    if args.local_rank != -1:
        try:
            from torch.nn.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "torch.nn.parallel.DistributedDataParallel is required for distributed training."
            )
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelImbalance(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            # from apex.optimizers import FP16_Optimizer
            from optimization_fp16 import FP16_Optimizer_State
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
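        # Added note (assuming FP16_Optimizer_State mirrors apex's
        # FP16_Optimizer): the wrapper below keeps fp32 master copies of the
        # fp16 parameters and unscales gradients into them before each step,
        # which is what makes model.half() above numerically safe.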
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer_State(optimizer,
                                             dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer_State(optimizer,
                                             static_loss_scale=args.loss_scale)
    else:
        # note: t_total is never computed in this evaluation-only script, and
        # the parser defines --warmup_proportion_step rather than
        # --warmup_proportion; with t_total=-1, BertAdam falls back to a
        # constant learning rate, which is sufficient here.
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=-1,
                             t_total=-1)

    if args.optim_recover_path is not None:
        logger.info("***** Recover optimizer from : {} *****".format(
            args.optim_recover_path))
        optim_recover = torch.load(args.optim_recover_path, map_location='cpu')
        if hasattr(optim_recover, 'state_dict'):
            optim_recover = optim_recover.state_dict()
        optimizer.load_state_dict(optim_recover)
        if args.loss_scale == 0:
            logger.info("***** Recover optimizer: dynamic_loss_scale *****")
            optimizer.dynamic_loss_scale = True

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    if args.do_train:

        pretrain_step = -1
        logger.info("***** Running training *****")
        logger.info("  Batch size = %d", args.train_batch_size)

        model.train()
        if recover_step:
            start_epoch = recover_step + 1
        else:
            start_epoch = 1
        for i_epoch in trange(start_epoch,
                              int(args.num_train_epochs) + 1,
                              desc="Epoch",
                              disable=args.local_rank not in (-1, 0)):
            if args.local_rank != -1:
                dev_reddit_sampler.set_epoch(i_epoch)

            logger.info("***** Running QKR evaluation *****")
            logger.info("  Batch size = %d", args.eval_batch_size)

            dev_iter_bar = tqdm(dev_reddit_dataloader,
                                desc='Iter (loss=X.XXX)',
                                disable=args.local_rank not in (-1, 0))

            total_lm_loss = 0
            for qkr_dev_step, batch in enumerate(dev_iter_bar):
                batch = [
                    t.to(device) if t is not None else None for t in batch
                ]
                if args.has_sentence_oracle:
                    input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch
                else:
                    input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, tgt_pos, labels, ks_labels, style_ids, style_labels, check_ids = batch
                    oracle_pos, oracle_weights, oracle_labels = None, None, None

                with torch.no_grad():

                    loss_tuple = model(input_ids,
                                       segment_ids,
                                       input_mask,
                                       lm_label_ids,
                                       is_next,
                                       masked_pos=masked_pos,
                                       masked_weights=masked_weights,
                                       task_idx=task_idx,
                                       masked_pos_2=oracle_pos,
                                       masked_weights_2=oracle_weights,
                                       masked_labels_2=oracle_labels,
                                       mask_qkv=mask_qkv,
                                       tgt_pos=tgt_pos,
                                       labels=labels,
                                       ks_labels=ks_labels,
                                       train_vae=args.train_vae,
                                       style_ids=style_ids,
                                       style_labels=style_labels,
                                       check_ids=check_ids,
                                       pretrain=None)

                    masked_lm_loss, next_sentence_loss, KL_loss, Mutual_loss, Golden_loss, cosine_similarity_loss, predict_kl_loss = loss_tuple
                    if n_gpu > 1:  # mean() to average on multi-gpu.
                        masked_lm_loss = masked_lm_loss.mean()

                    # accumulate the per-step LM loss for averaging
                    total_lm_loss += masked_lm_loss.item()

                    # running mean of the dev LM loss so far
                    total_mean_lm_loss = total_lm_loss / (qkr_dev_step + 1)
                    dev_iter_bar.set_description(
                        'Iter (loss=%5.3f)' % total_mean_lm_loss)

            logger.info("** ** * Evaling mean loss ** ** * ")
            logger.info("In{}epoch,dev_lm_loss:{}".format(
                i_epoch, total_mean_lm_loss))
            logger.info("ppl:{}".format(np.exp(total_mean_lm_loss)))
            logger.info("******************************************* ")
            break  # a single evaluation pass is performed, then the loop exits
def train(args, model, tokenizer):
    """ Fine-tune the pretrained model on the corpus. """
    set_seed(args)

    # Load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = load_and_cache_examples(args, tokenizer)
    train_sampler = RandomSampler(train_dataset)
    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=model_collate_fn,
    )

    # Training schedule
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = t_total // (
            len(train_dataloader) // args.gradient_accumulation_steps + 1
        )
    else:
        t_total = (
            len(train_dataloader)
            // args.gradient_accumulation_steps
            * args.num_train_epochs
        )

    # Prepare the optimizer
    lr = {"encoder": 0.002, "decoder": 0.2}
    warmup_steps = {"encoder": 20000, "decoder": 10000}
    optimizer = BertSumOptimizer(model, lr, warmup_steps)
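    # Added sketch (assuming BertSumOptimizer follows the Noam-style schedule
    # from the BertSum paper; the actual implementation may differ): each
    # parameter group is updated with
    #   lr_t = lr * min(step ** -0.5, step * warmup_steps ** -1.5)
    # so the freshly initialized decoder gets a higher peak lr and a shorter
    # warmup than the pretrained encoder.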

    # Train
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info(
        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
    )
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
        # * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    model.zero_grad()
    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)

    global_step = 0
    tr_loss = 0.0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
        for step, batch in enumerate(epoch_iterator):
            source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch

            source = source.to(args.device)
            target = target.to(args.device)
            encoder_token_type_ids = encoder_token_type_ids.to(args.device)
            encoder_mask = encoder_mask.to(args.device)
            decoder_mask = decoder_mask.to(args.device)
            lm_labels = lm_labels.to(args.device)

            model.train()
            outputs = model(
                source,
                target,
                encoder_token_type_ids=encoder_token_type_ids,
                encoder_attention_mask=encoder_mask,
                decoder_attention_mask=decoder_mask,
                decoder_lm_labels=lm_labels,
            )

            loss = outputs[0]
            logger.debug("step loss = %f", loss.item())
            if args.gradient_accumulation_steps > 1:
                loss /= args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
Example no. 14
    def train(
        self,
        train_dataset,
        output_dir,
        show_running_loss=True,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        device = self.device

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        if args["max_steps"] > 0:
            t_total = args["max_steps"]
            args["num_train_epochs"] = (
                args["max_steps"] //
                (len(train_dataloader) // args["gradient_accumulation_steps"])
                + 1)
        else:
            t_total = len(train_dataloader) // args[
                "gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args["weight_decay"],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ]
            },
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args[
            "warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args["learning_rate"],
                          eps=args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if (args["model_name"] and os.path.isfile(
                os.path.join(args["model_name"], "optimizer.pt"))
                and os.path.isfile(
                    os.path.join(args["model_name"], "scheduler.pt"))):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(args["model_name"], "optimizer.pt")))
            scheduler.load_state_dict(
                torch.load(os.path.join(args["model_name"], "scheduler.pt")))

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"],
                                mininterval=0)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args["model_name"] and os.path.exists(args["model_name"]):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args["model_name"].split("/")[-1].split(
                    "-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) //
                    args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) //
                    args["gradient_accumulation_steps"])

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d",
                            epochs_trained)
                logger.info("   Continuing training from global step %d",
                            global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores(
                **kwargs)

        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for current_epoch in train_iterator:
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args["n_gpu"] > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     amp.master_params(optimizer), args["max_grad_norm"]
                    # )
                else:
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     model.parameters(), args["max_grad_norm"]
                    # )

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args["max_grad_norm"])

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args["logging_steps"],
                                             global_step)
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current,
                                         optimizer,
                                         scheduler,
                                         model=model)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_data,
                            verbose=verbose
                            and args["evaluate_during_training_verbose"],
                            silent=True,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current,
                                             optimizer,
                                             scheduler,
                                             model=model,
                                             results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args["output_dir"],
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"],
                                             optimizer,
                                             scheduler,
                                             model=model,
                                             results=results)
                        if best_eval_metric and args[
                                "early_stopping_metric_minimize"]:
                            if (results[args["early_stopping_metric"]] -
                                    best_eval_metric <
                                    args["early_stopping_delta"]):
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 optimizer,
                                                 scheduler,
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if (results[args["early_stopping_metric"]] -
                                    best_eval_metric >
                                    args["early_stopping_delta"]):
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 optimizer,
                                                 scheduler,
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args[
                    "evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current,
                                 optimizer,
                                 scheduler,
                                 model=model)

            if args["evaluate_during_training"]:
                results = self.eval_model(
                    eval_data,
                    verbose=verbose
                    and args["evaluate_during_training_verbose"],
                    silent=True,
                    **kwargs)

                self._save_model(output_dir_current,
                                 optimizer,
                                 scheduler,
                                 results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args["output_dir"],
                                           "training_progress_scores.csv"),
                              index=False)

                if args["wandb_project"]:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"],
                                     optimizer,
                                     scheduler,
                                     model=model,
                                     results=results)
                if best_eval_metric and args["early_stopping_metric_minimize"]:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric < args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         optimizer,
                                         scheduler,
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0
                    else:
                        if args["use_early_stopping"] and args[
                                "early_stopping_consider_epochs"]:
                            if early_stopping_counter < args[
                                    "early_stopping_patience"]:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args['early_stopping_metric']}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args['early_stopping_patience']}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args['early_stopping_patience']} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return global_step, tr_loss / global_step
                else:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric > args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         optimizer,
                                         scheduler,
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0
                    else:
                        if args["use_early_stopping"] and args[
                                "early_stopping_consider_epochs"]:
                            if early_stopping_counter < args[
                                    "early_stopping_patience"]:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args['early_stopping_metric']}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args['early_stopping_patience']}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args['early_stopping_patience']} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return global_step, tr_loss / global_step

        return global_step, tr_loss / global_step
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
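        # Added worked example: with 8000 training examples, batch size 32
        # and 2 accumulation steps, int(8000 / 32 / 2) = 125 optimizer steps
        # per epoch, i.e. 375 total over 3 epochs (then divided by the world
        # size for distributed runs).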

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
              cache_dir=cache_dir,
              num_labels=num_labels)

    ### RANDOM INITIALIZATION ####
    # config = BertConfig.from_dict({
    #         "attention_probs_dropout_prob": 0.1,
    #         "hidden_act": "gelu",
    #         "hidden_dropout_prob": 0.1,
    #         "hidden_size": 768,
    #         "initializer_range": 0.02,
    #         "intermediate_size": 3072,
    #         "max_position_embeddings": 512,
    #         "num_attention_heads": 12,
    #         "num_hidden_layers": 12,
    #         "type_vocab_size": 2,
    #         "vocab_size": 30522
    #     })
    # model = BertForSequenceClassification(config=config, num_labels=num_labels)


    ###############################

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
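        # Dynamic loss scaling lets FP16_Optimizer shrink the scale on overflow
        # and grow it back periodically; a fixed positive loss_scale keeps the
        # scale constant for the whole run.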

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits, _ = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
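                        # For reference, warmup_linear(x, warmup) in
                        # pytorch_pretrained_bert rises linearly from 0 to 1
                        # while x < warmup and then decays as 1 - x, so the
                        # learning rate warms up and then decays linearly to 0.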
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits, attns = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss/nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir + '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits, _ = model(input_ids, segment_ids, input_mask, labels=None)
            
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss/nb_tr_steps if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
Exemplo n.º 16
0
    def __init__(self, data_dir, label_dir, tokenizer, train_file='train.csv', val_file='val.csv', test_data=None,
                 label_file='labels.csv', text_col='text', label_col='label', batch_size_per_gpu=16, max_seq_length=512,
                 multi_gpu=True, multi_label=False, backend="nccl", model_type='bert', logger=None, clear_cache=False, no_cache=False):
        
        if isinstance(tokenizer, str):
            _, _, tokenizer_class = MODEL_CLASSES[model_type]
            # instantiate a new tokenizer object from the tokenizer name
            tokenizer = tokenizer_class.from_pretrained(tokenizer, do_lower_case=('uncased' in tokenizer))

        self.tokenizer = tokenizer  
        self.data_dir = data_dir
        self.cache_dir = data_dir/'cache'    
        self.max_seq_length = max_seq_length
        self.batch_size_per_gpu = batch_size_per_gpu
        self.train_dl = None
        self.val_dl = None
        self.test_dl = None
        self.multi_label = multi_label
        self.n_gpu = 0
        self.no_cache = no_cache
        self.model_type = model_type
        self.output_mode = 'classification'
        if logger is None:
            logger = logging.getLogger()
        self.logger = logger
        if multi_gpu:
            self.n_gpu = torch.cuda.device_count()
        
        if clear_cache:
            shutil.rmtree(self.cache_dir, ignore_errors=True)
            
        if multi_label:
            processor = MultiLabelTextProcessor(data_dir, label_dir)
        else:
            processor = TextProcessor(data_dir, label_dir)

        self.labels = processor.get_labels(label_file)
        
        if train_file:
            # Train DataLoader
            train_examples = None
            cached_features_file = os.path.join(self.cache_dir, 'cached_{}_{}_{}_{}'.format(
                                    self.model_type,
                                    'train',
                                    'multi_label' if self.multi_label else 'multi_class',
                                    str(self.max_seq_length)))
        
            if not os.path.exists(cached_features_file):
                train_examples = processor.get_train_examples(
                    train_file, text_col=text_col, label_col=label_col)  

            train_dataset = self.get_dataset_from_examples(train_examples, 'train')

            self.train_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu)
            train_sampler = RandomSampler(train_dataset)
            self.train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.train_batch_size)
            

        if val_file:
            # Validation DataLoader
            val_examples = None
            cached_features_file = os.path.join(self.cache_dir, 'cached_{}_{}_{}_{}'.format(
                                    self.model_type,
                                    'dev',
                                    'multi_label' if self.multi_label else 'multi_class',
                                    str(self.max_seq_length)))
            
            if not os.path.exists(cached_features_file):
                val_examples = processor.get_dev_examples(
                    val_file, text_col=text_col, label_col=label_col)
            
            val_dataset = self.get_dataset_from_examples(val_examples, 'dev')
            
            self.val_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu)
            val_sampler = SequentialSampler(val_dataset) 
            self.val_dl = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.val_batch_size)
            
        
        if test_data:
            # Test set loader for predictions 
            test_examples = []
            input_data = []

            for index, text in enumerate(test_data):
                test_examples.append(InputExample(index, text))
                input_data.append({
                    'id': index,
                    'text': text
                })


            test_dataset = self.get_dataset_from_examples(test_examples, 'test', is_test=True)
            
            self.test_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu)
            test_sampler = SequentialSampler(test_dataset)
            self.test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=self.test_batch_size)
Exemplo n.º 17
0
def train(args, train_dataset, model, criterion, tokenizer):
    tb_writer = SummaryWriter(args.output_dir)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = int(len(train_dataloader) * args.num_train_epochs)
        args.num_train_epochs = int(np.ceil(args.num_train_epochs))

    optimizer, scheduler = get_adamw(model, t_total, args.warmup_steps, args.learning_rate, weight_decay=args.weight_decay)

    # if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and \
    #     os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
    #     optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
    #     scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))

    train_desc = args.task_name
    if args.src_genre is not None and args.src_genre != '':
        train_desc += '-' + args.src_genre
    print(f'***** Fine-tuning {args.model_name_or_path} {train_desc} *****')
    print(f'    Num examples = {len(train_dataset)}')
    print(f'    Num Epochs = {args.num_train_epochs}')
    print(f'    Train batch size = {args.train_batch_size}')
    print(f'    Total optimization steps = {t_total}')

    ckpt_steps = set([int(x) for x in np.linspace(0, t_total, args.num_ckpts + 1)[1:]])
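    # For illustration (assumed numbers): t_total = 100 and num_ckpts = 4 give
    # np.linspace(0, 100, 5)[1:] = [25, 50, 75, 100], i.e. evenly spaced
    # checkpoint steps that always include the final step.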

    model.train()
    model.zero_grad()

    global_step = 0
    step_loss = []
    eval_results = []

    pbar = tqdm(total=t_total, desc='train')
    set_seed(args)
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            loss = get_loss(args.model_type, model, criterion, batch)
            loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            step_loss.append(loss.item())
            global_step += 1
            pbar.update(1)
            pbar.set_description_str(f'train: {train_desc} (loss = {step_loss[-1]:.2f}, lr = {scheduler.get_lr()[0]:.2g})')

            if global_step in ckpt_steps:
                ckpt_path = os.path.join(args.output_dir, f'step_{global_step}.bin')
                torch.save(model, ckpt_path)

                if args.do_eval:
                    step_eval_results = evaluate(args, model, criterion, tokenizer)
                    tr_loss = np.mean(step_loss)
                    if len(step_eval_results) == 2:
                        eval_loss = step_eval_results['mnli']
                        eval_loss_mm = step_eval_results['mnli-mm']
                        eval_results.append([global_step, tr_loss, eval_loss, eval_loss_mm])
                        print(f'\nSaving model checkpoint to {ckpt_path}, avg_loss = {tr_loss:.2f}, eval_loss = {eval_loss:.2f}, '
                                    f'eval_loss_mm = {eval_loss_mm:.2f}\n')
                    else:
                        eval_loss = step_eval_results['mnli']
                        eval_results.append([global_step, tr_loss, eval_loss])
                        print(f'\nSaving model checkpoint to {ckpt_path}, avg_loss = {tr_loss:.2f}, eval_loss = {eval_loss:.2f}\n')
                else:
                    print(f'\nSaving model checkpoint to {ckpt_path}\n')

            if global_step % args.logging_steps == 0:
                tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', np.mean(step_loss), global_step)
                step_loss = []

            if global_step == args.max_steps:
                pbar.close()
                break
        if global_step == args.max_steps:
            break

    if args.do_eval:
        if len(eval_results[0]) == 4:
            header = ['step', 'avg_loss', 'eval_loss', 'mm_loss']
        else:
            header = ['step', 'avg_loss', 'eval_loss']
        best_results = report_results(header, eval_results, 2)
        best_step = best_results[0]
        print(f'best_ckpt = {os.path.join(args.output_dir, f"step_{best_step}.bin")}\n')
Exemplo n.º 18
0
def train(model, tokenizer, train_dataset, eval_dataset, batch_size, lr, adam_epsilon,
          epochs, output_dir):
    """

    :param model: Bert Model to train
    :param tokenizer: Bert Tokenizer to train
    :param train_dataset:
    :param batch_size: Stick to 1 if not using using a high end GPU
    :param lr: Suggested learning rate from paper is 5e-5
    :param adam_epsilon: Used for weight decay fixed suggested parameter is
    1e-8
    :param epochs: Usually a single pass through the entire dataset is
    satisfactory
    :return: Loss
    """

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=batch_size)
    train_positions_to_mask = train_dataset.positions_to_mask

    t_total = len(train_dataloader) * epochs  # total optimization steps (one per batch)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if
                    not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 0, t_total)

    # ToDo Case for fp16

    # Start of training loop
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", batch_size)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.resize_token_embeddings(len(tokenizer))
    train_iterator = trange(int(epochs), desc="Epoch")
    epoch_info = []
    proceed = False
    tmp_global_step = 0
    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        model.train()
        with torch.set_grad_enabled(True):
            for i, batch in enumerate(epoch_iterator):
                
                if tmp_global_step >= global_step or proceed:
                    proceed = True
                else:
                    tmp_global_step += 1
                
                if proceed:
                    inputs, labels = custom_mask_tokens(batch, tokenizer, train_positions_to_mask[i])
                    inputs = inputs.to('cuda')  # assumes a GPU; drop the .to('cuda') calls on CPU
                    labels = labels.to('cuda')
                    outputs = model(inputs, masked_lm_labels=labels)
                    # model outputs are always tuple in transformers (see doc)
                    loss = outputs[0]

                    loss.backward()
                    tr_loss += loss.item()

                    # if (step + 1) % 1 == 0: # 1 here is a placeholder for gradient
                    # accumulation steps
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

        if proceed:
            with torch.set_grad_enabled(False):
                epoch_info = eval_and_save_model(output_dir, eval_dataset, global_step, epoch_info, model, optimizer, tokenizer)
            logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
            tr_loss = 0

    return model, tokenizer
Exemplo n.º 19
0
    optimizer = optim.Adadelta(model.params_requires_grad(),
                               weight_decay=config.weight_decay,
                               lr=config.learning_rate,
                               eps=config.adam_eps)
    print(model)

    global_steps = 0.
    f1_best = 0.
    f1_test = 0.
    logging_loss, tr_loss = 0., 0.
    epoch_improve = 0.
    restart_used = 0
    model_name = 'model_gcn_2018.ckpt'
    log_name = 'log_gcn2018.txt'
    tensorboard_name = 'model_1.ckpt'
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=config.batch_size)
    total_steps = len(train_loader) * config.num_epoch
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=total_steps)
    tb_writer = SummaryWriter(os.path.join(config.output_dir,
                                           tensorboard_name))
    identity_matrix = torch.eye(config.max_sent).unsqueeze(0)

    print('-> Start training process')
    print('nepoch: ', config.num_epoch)
    print('total step: ', total_steps)
Exemplo n.º 20
0
def run_aug(args, save_every_epoch=False):
    processors = {
        # you can add your processor here
        "TREC": AugProcessor,
        "stsa.fine": AugProcessor,
        "stsa.binary": AugProcessor,
        "mpqa": AugProcessor,
        "rt-polarity": AugProcessor,
        "subj": AugProcessor,
        "toxic": AugProcessor
    }

    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(args.data_dir, task_name)
    args.output_dir = os.path.join(args.output_dir, task_name)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    #dev_examples = processor.get_dev_examples(args.data_dir)
    #train_examples.extend(dev_examples)
    num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)

    # Prepare model
    def load_model(model_name):
        weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name)
        model = torch.load(weights_path)
        return model
    cbert_name = "{}/BertForMaskedLM_{}_epoch_10".format(task_name.lower(), task_name.lower())
    model = load_model(cbert_name)
    model.cuda()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,lr=args.learning_rate,
                         warmup=args.warmup_proportion,t_total=t_total)

    global_step = 0
    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)
    MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]

    origin_train_path = os.path.join(args.output_dir, "train_origin.tsv")
    save_train_path = os.path.join(args.output_dir, "train.tsv")
    shutil.copy(origin_train_path, save_train_path)
    best_test_acc = train_text_classifier.train("aug_data")
    print("before augment best acc:{}".format(best_test_acc))

    for e in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.

        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.cuda() for t in batch)
            _, input_ids, input_mask, segment_ids, masked_ids = batch
            loss = model(input_ids, segment_ids, input_mask, masked_ids)
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()
            model.zero_grad()
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0
        torch.cuda.empty_cache()
        shutil.copy(origin_train_path, save_train_path)
        save_train_file = open(save_train_path, 'a')
        tsv_writer = csv.writer(save_train_file, delimiter='\t')
        #tsv_writer.writerow(['sentence', 'label'])
        for step, batch in enumerate(train_dataloader):
            model.eval()
            batch = tuple(t.cuda() for t in batch)
            init_ids, _, input_mask, segment_ids, _ = batch
            input_lens = [sum(mask).item() for mask in input_mask]
            #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens])
            masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens])
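            # e.g. a 21-token input gets max(21 // 7, 2) = 3 randomly chosen
            # masked positions; short inputs always get at least 2 masks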
            for ids, idx in zip(init_ids,masked_idx):
                ids[idx] = MASK_id
            with torch.no_grad():
                predictions = model(init_ids, segment_ids, input_mask)
            for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids):
                #pred = torch.argsort(pred)[:,-e-1][idx]
                '''
                pred = torch.argsort(preds)[:,-1][idx]
                ids[idx] = pred
                new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
                new_str = rev_wordpiece(new_str)
                tsv_writer.writerow([new_str, seg[0].item()])
                '''
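                # take the second most likely token at each masked position so
                # the augmented sentence differs from the original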
                pred = torch.argsort(preds)[:, -2][idx]
                ids[idx] = pred
                new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
                new_str = rev_wordpiece(new_str)
                tsv_writer.writerow([new_str, seg[0].item()])
            torch.cuda.empty_cache()
        predictions = predictions.detach().cpu()
        torch.cuda.empty_cache()
        bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e))
        shutil.copy(save_train_path, bak_train_path)
        best_test_acc = train_text_classifier.train("aug_data")
        print("epoch {} augment best acc:{}".format(e, best_test_acc))
        if save_every_epoch:
            save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
            save_model_path = os.path.join(save_model_dir, save_model_name)
            torch.save(model, save_model_path)
        else:
            if (e + 1) % 10 == 0:
                save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
                save_model_path = os.path.join(save_model_dir, save_model_name)
                torch.save(model, save_model_path)
Exemplo n.º 21
0
def train(model, optimizer, scheduler, dataset_train, dataset_valid, dataset_test, config, evaluator):  # valid_feed can be None

	patience = 10  # wait for at least 10 epochs before stopping
	valid_loss_threshold = np.inf
	best_valid_loss = np.inf
	best_eval_valid = 0.0
	final_eval_best = 0.0

	sampler_train = RandomSampler(dataset_train) if config.local_rank == -1 else DistributedSampler(dataset_train)
	# sampler_train = SequentialSampler(dataset_train) if config.local_rank == -1 else DistributedSampler(dataset_train)
	dataloader_train = DataLoader(dataset_train, sampler=sampler_train, batch_size=config.batch_size)

	batch_cnt = 0
	ckpt_step = len(dataloader_train.dataset) // dataloader_train.batch_size
	logger.info("**** Training Begins ****")
	logger.info("**** Epoch 0/{} ****".format(config.max_epoch))

	loss_func = None
	# if config.use_parallel and not config.use_apex:
	# if config.n_gpu > 1 and not config.use_apex:
	if config.n_gpu > 1 and config.local_rank == -1:
		loss_func = get_loss_func(config=config, pad_id=model.module.pad_id)
	else:
		loss_func = get_loss_func(config=config, pad_id=model.pad_id)

	if config.use_gpu:
		loss_func.cuda()

	# epoch loop
	model.train()
	for cur_epoch in range(config.max_epoch):

		# loop until traverse all batches
		train_loss = []
		for text_inputs, label_y, *remains in dataloader_train:
			mask_input = remains[0]
			len_seq = remains[1]
			len_sents = remains[2]
			tid = remains[3]
			len_para = remains[4]

			text_inputs = utils.cast_type(text_inputs, LONG, config.use_gpu)
			mask_input = utils.cast_type(mask_input, FLOAT, config.use_gpu)
			len_seq = utils.cast_type(len_seq, FLOAT, config.use_gpu)

			# training for this batch
			optimizer.zero_grad()
			
			coh_score = model(text_inputs=text_inputs, mask_input=mask_input, len_seq=len_seq, len_sents=len_sents, tid=tid, len_para=len_para, mode="")  # model.forward returns the coherence score; the loss is computed below
			
			if config.output_size == 1:
				coh_score = coh_score.view(text_inputs.shape[0])
			else:
				coh_score = coh_score.view(text_inputs.shape[0], -1)

			# get loss
			if config.output_size == 1:
				label_y = utils.cast_type(label_y, FLOAT, config.use_gpu)
			else:
				label_y = utils.cast_type(label_y, LONG, config.use_gpu)
			label_y = label_y.view(text_inputs.shape[0])

			# print(coh_score)
			# print(label_y)

			loss = loss_func(coh_score, label_y)
			if config.n_gpu > 1:
				loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training

			loss.backward()
			# with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
			#     scaled_loss.backward()

			# print(coh_score)
			# print(label_y)
			# print()
			
			torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
			#for p in model.parameters():  # gradient control manually
			#    if p.grad is not None:
			#        p.data.add_(-config.init_lr, p.grad.data)
			# clip_grad_norm_(amp.master_params(optimizer), config.clip_n2)

			# update optimizer and scheduler
			optimizer.step()
			if scheduler is not None:
				# scheduler.step()
				scheduler.step(loss)

			# for param_group in optimizer.param_groups:
			# 	print(param_group['lr'])

			train_loss.append(loss.item())

			# temporal averaging for encoders (proposed in ICLR18)
			if config.encoder_type == "reg_lstm" and config.beta_ema > 0:
				if config.n_gpu > 1:
					model.module.encoder_coh.update_ema()
				else:
					model.encoder_coh.update_ema()
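			# update_ema presumably maintains an exponential moving average of the
			# encoder weights: ema_p = beta_ema * ema_p + (1 - beta_ema) * p.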

			batch_cnt = batch_cnt + 1

			# print train process
			if batch_cnt % config.print_step == 0:
				logger.info("{}/{}-({:.3f})".format(batch_cnt % config.ckpt_step,
																		 config.ckpt_step,
																		 loss))

			## validation
			if batch_cnt % ckpt_step == 0:  # manual epoch printing
			# if i == batch_num-1:  # every epoch
				logger.info("\n=== Evaluating Model ===")

				# validation
				eval_cur_valid = -1
				if dataset_valid is not None:
					loss_valid, eval_cur_valid, _ = validate(model, evaluator, dataset_valid, config, loss_func)
					logger.info("")

				if eval_cur_valid >= best_eval_valid or dataset_valid is None:
				# if dataset_valid is not None:
					logger.info("Best {} on Valid {}".format(evaluator.eval_type, eval_cur_valid))
					best_eval_valid = eval_cur_valid

					valid_loss, eval_last, eval_best = validate(model, evaluator, dataset_test, config, loss_func, is_test=True)
					if eval_best > final_eval_best: 
						final_eval_best = eval_best

						# save model
						if config.save_model:
							logger.info("Model Saved.")
							torch.save(model.state_dict(), os.path.join(config.session_dir, "model"))

						# save prediction log for error analysis
						if config.gen_logs:
							pred_log_name = "log_pred_" + str(config.essay_prompt_id_train) + "_" + str(config.essay_prompt_id_test) + "_" + str(config.cur_fold) + ".log"
							if config.eval_type.lower() == "qwk":
								pred_out = np.stack((evaluator.rescaled_pred, evaluator.origin_label_np, evaluator.tid_np))
								np.savetxt(os.path.join(config.session_dir, pred_log_name), pred_out, fmt ='%.0f')
							elif config.eval_type.lower() == "accuracy":
								pred_out = np.stack((evaluator.pred_list_np, evaluator.origin_label_np, evaluator.tid_np))
								pred_out = pred_out.T
								np.savetxt(os.path.join(config.session_dir, pred_log_name), pred_out, fmt ='%.0f')


						# # error analysis: std data for lexical cohesion
						# if config.gen_logs and config.target_model == "ilcr_scd":
						# 	std_log_name = "log_std_" + str(config.essay_prompt_id_train) + "_" + str(config.essay_prompt_id_test) + "_" + str(config.cur_fold) + ".log"
						# 	# # file read
						# 	std_data = evaluator.map_suppl["std"]
						# 	with open(os.path.join(config.session_dir, std_log_name), "w") as f:
						# 		f.write(repr(std_data))

				evaluator.map_suppl = {}  # reset

				# early stopping parts (disabled)
				# if valid_loss < best_valid_loss:
				#     if valid_loss <= valid_loss_threshold * config.improve_threshold:
				#         patience = max(patience,
				#                        cur_epoch * config.patient_increase)
				#         valid_loss_threshold = valid_loss
				#         logger.info("Update patience to {}".format(patience))
				#     # end if if valid_loss <= valid_loss_threshold * config.improve_threshold
				#
				#     best_valid_loss = valid_loss
				# # end if valid_loss < best_valid_loss:

				# if cur_epoch >= config.max_epoch \
				#         or config.early_stop and patience <= cur_epoch:
				#     if cur_epoch < config.max_epoch:
				#         logger.info("!!Early stop due to run out of patience!!")
				#
				#     logger.info("Best validation loss %f" % best_valid_loss)
				#
				#     return
				# end if if cur_epoch >= config.max_epoch \

				# exit eval model
				model.train()
				train_loss = []
				logger.info("\n**** Epcoch {}/{} ****".format(cur_epoch,
															  config.max_epoch))
			# end valdation

			if config.use_gpu and config.empty_cache:
				torch.cuda.empty_cache()    # due to memory shortage
		# end batch loop
	# end epoch loop
	logger.info("Best {} on Test {}".format(evaluator.eval_type, final_eval_best))
	logger.info("")

	return final_eval_best
Exemplo n.º 22
0
def train(args, train_dataset, model, tokenizer):
  """ Train the model """
  if args.local_rank in [-1, 0]:
    tb_writer = SummaryWriter()

  args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
  train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

  if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
  else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
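  # For illustration (assumed numbers): 1,000 batches per epoch, 4 accumulation
  # steps and 3 epochs give 1000 // 4 * 3 = 750 total optimizer updates.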

  # Prepare optimizer and schedule (linear warmup and decay)
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
  scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
  if args.fp16:
    try:
      from apex import amp
    except ImportError:
      raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

  # multi-gpu training (should be after apex fp16 initialization)
  if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

  # Distributed training (should be after apex fp16 initialization)
  if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                              output_device=args.local_rank,
                              find_unused_parameters=True)

  # Train!
  logger.info("***** Running training *****")
  logger.info("  Num examples = %d", len(train_dataset))
  logger.info("  Num Epochs = %d", args.num_train_epochs)
  logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
  logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
           args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
  logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
  logger.info("  Total optimization steps = %d", t_total)

  global_step = 0
  tr_loss, logging_loss = 0.0, 0.0
  model.zero_grad()
  train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
  set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
  for _ in train_iterator:
    loss_this_epoch = 0 
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
    for step, batch in enumerate(epoch_iterator):
      inputs, labels, attention_mask = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch, torch.ones_like(batch))  # all-ones attention mask when not masking
      inputs = inputs.to(args.device)
      labels = labels.to(args.device)
      attention_mask = attention_mask.to(args.device)
      model.train()
      outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels)  # if args.mlm else model(inputs, labels=labels)
      loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

      loss_this_epoch = loss.detach().mean().item() + loss_this_epoch  # accumulate a float, not the graph

      if args.n_gpu > 1:
        loss = loss.mean()  # mean() to average on multi-gpu parallel training
      if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps

      if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
          scaled_loss.backward()
      else:
        loss.backward()

      tr_loss += loss.item()
      if (step + 1) % args.gradient_accumulation_steps == 0:
        if args.fp16:
          torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
          torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        global_step += 1

        if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
          # Log metrics
          if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
            results = evaluate(args, model, tokenizer)
            for key, value in results.items():
              tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
          tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
          tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
          logging_loss = tr_loss

        if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
          # Save model checkpoint
          output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
          if not os.path.exists(output_dir):
            os.makedirs(output_dir)
          model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
          model_to_save.save_pretrained(output_dir)
          torch.save(args, os.path.join(output_dir, 'training_args.bin'))
          logger.info("Saving model checkpoint to %s", output_dir)

      if args.max_steps > 0 and global_step > args.max_steps:
        epoch_iterator.close()
        break
    if args.max_steps > 0 and global_step > args.max_steps:
      train_iterator.close()
      break

    print('\ntrain loss epoch ... {} (not exact loss)'.format(loss_this_epoch / (step + 1)))

  if args.local_rank in [-1, 0]:
    tb_writer.close()

  return global_step, tr_loss / global_step
Exemplo n.º 23
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    ## Required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--train_ans_file",
        default=None,
        type=str,
        help="SQuAD answer for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--restore',
                        default=False,
                        action='store_true',
                        help="Whether to restore model/optimizer/amp state from amp_checkpoint.pt")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        #        torch.backends.cudnn.benchmark = True
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if not os.path.exists(args.output_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        os.makedirs(args.output_dir, exist_ok=True)

    import pickle as cPickle
    train_examples = None
    num_train_steps = None
    if args.do_train:
        raw_test_data = open(args.predict_file, mode='r')
        raw_train_data = open(args.train_file, mode='r')
        if os.path.exists("train_file_baseline.pkl") and False:
            train_examples = cPickle.load(
                open("train_file_baseline.pkl", mode='rb'))
        else:
            ans_dict = {}
            with open(args.train_ans_file) as f:
                for line in f:
                    line = line.split(',')
                    ans_dict[line[0]] = int(line[1])
            train_examples = read_chid_examples(raw_train_data,
                                                is_training=True,
                                                ans_dict=ans_dict)
            cPickle.dump(train_examples,
                         open("newtrain_file_baseline.pkl", mode='wb'))

        #tt = len(train_examples) // 2
        #train_examples = train_examples[:tt]

        logger.info("train examples {}".format(len(train_examples)))
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    bert_config = BertConfig.from_json_file(args.bert_config_file)
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    model = BertForCloze(bert_config, num_choices=10)
    if args.init_checkpoint is not None:
        logger.info('load bert weight')
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        # new_state_dict=state_dict.copy()
        # for kye ,value in state_dict.items():
        #     new_state_dict[kye.replace("bert","c_bert")]=value
        # state_dict=new_state_dict
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})

            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                # logger.info("name {} chile {}".format(name,child))
                if child is not None:
                    load(child, prefix + name + '.')

        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
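        # the conditional prefix handles checkpoints whose keys carry a top-level
        # 'bert.' name regardless of whether this model wraps BERT as a submodule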
        logger.info("missing keys:{}".format(missing_keys))
        logger.info('unexpected keys:{}'.format(unexpected_keys))
        logger.info('error msgs:{}'.format(error_msgs))
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used here,
    # since it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex import amp
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate)
        # optimizer = BertAdam(optimizer_grouped_parameters,
        #                      lr=args.learning_rate,
        #                      warmup=args.warmup_proportion,
        #                      t_total=t_total)
        # optimizer = RAdam(optimizer_grouped_parameters,
        #                      lr=args.learning_rate)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.restore:
        # Restore a previous amp checkpoint. apex expects amp.initialize to be
        # called exactly once before the state dicts are loaded, so this path
        # assumes fp16 initialization did not already happen above.
        checkpoint = torch.load('amp_checkpoint.pt')

        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        amp.load_state_dict(checkpoint['amp'])

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_v{1}'.format(
            str(args.max_seq_length), str(4))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:  # cache missing or unreadable; rebuild the features
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length)

            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_labels = torch.tensor([f.label for f in train_features],
                                  dtype=torch.long)
        all_option_ids = torch.tensor([f.option_ids for f in train_features],
                                      dtype=torch.long)
        all_positions = torch.tensor([f.position for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_labels, all_option_ids,
                                   all_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True,
                                      pin_memory=True)
        loss_ini = 50
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Visdom dashboards: per-epoch loss, global loss, and accuracy.
            vizname = 'epoch' + str(epoch)
            viz = Visdom(env=str(vizname))
            vis = Visdom(env='loss')
            via = Visdom(env='ac')
            model.train()
            model.zero_grad()
            epoch_iterator = tqdm(train_dataloader, disable=None)
            for step, batch in enumerate(epoch_iterator):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu DataParallel does the scattering itself
                input_ids, input_mask, segment_ids, labels, option_ids, positions = batch
                loss = model(input_ids, option_ids, segment_ids, input_mask,
                             positions, labels)
                #                print('att', loss.size())
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    #                    model, optimizer = amp.initialize(model, optimizer, opt_level= "O1")
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    # if args.fp16:
                    #      torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % 1000 == 0:
                    logger.info("loss@{}:{}".format(step, loss.cpu().item()))
                steptotal = step + epoch * int(
                    len(train_examples) / args.train_batch_size)
                if (steptotal + 1) % 50 == 0:
                    vis.line([loss.cpu().item()], [steptotal],
                             win='train_loss',
                             update='append')
                if (step + 1) % 50 == 0:
                    viz.line([loss.cpu().item()], [step],
                             win='train_loss',
                             update='append')
            # Log the final batch loss of the epoch.
            loss_ini = loss.cpu().item()
            logger.info("epoch %d loss: %f", epoch, loss_ini)
            raw_test_data_pre = open(args.predict_file, mode='r')
            eval_examples = read_chid_examples(raw_test_data_pre,
                                               is_training=False)
            # eval_examples=eval_examples[:100]
            eval_features = convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length)

            logger.info("***** Running predictions *****")
            logger.info("  Num orig examples = %d", len(eval_examples))
            logger.info("  Num split examples = %d", len(eval_features))
            logger.info("  Batch size = %d", args.predict_batch_size)

            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_option_ids = torch.tensor(
                [f.option_ids for f in eval_features], dtype=torch.long)
            all_positions = torch.tensor([f.position for f in eval_features],
                                         dtype=torch.long)
            all_tags = torch.tensor([f.tag for f in eval_features],
                                    dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_option_ids,
                                      all_positions, all_tags)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.predict_batch_size)

            model.eval()
            reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None)
            total_dev_loss = 0
            all_results = {}
            logger.info("Start evaluating")
            for input_ids, input_mask, segment_ids, option_ids, positions, tags in \
                    tqdm(eval_dataloader, desc="Evaluating",disable=None):
                if len(all_results) % 1000 == 0:
                    logger.info("Processing example: %d" % (len(all_results)))
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                option_ids = option_ids.to(device)
                positions = positions.to(device)
                with torch.no_grad():
                    batch_logits, align = model(input_ids, option_ids,
                                                segment_ids, input_mask,
                                                positions)
                for i, tag in enumerate(tags):
                    logits = batch_logits[i].detach().cpu().numpy()
                    logit = torch.tensor([logits])  # shape (1, num_choices)

                    # Tags are global idiom IDs; 577157 is assumed to be the
                    # ID of the first dev example, giving a 0-based row index
                    # into dev_answer1.csv.
                    inum = int(tag) - 577157
                    dlabel = torch.tensor([reader1[1][inum]])
                    # loss_dev = FocalLoss(gamma=0.25)
                    loss_dev = CrossEntropyLoss()
                    dev_loss = loss_dev(logit, dlabel)
                    total_dev_loss += dev_loss
                    #  for index1, dlabel in zip(reader1[0], reader1[1]):
                    #      if index1[6:11] == str(tag):
                    #          loss_dev =CrossEntropyLoss()
                    #          dev_loss = loss_dev(logits, dlabel)
                    #          total_dev_loss += dev_loss
                    #          continue
                    ans = np.argmax(logits)
                    all_results["#idiom%06d#" % tag] = ans

            predict_name = "ln11saprediction" + str(epoch) + ".csv"
            output_prediction_file = os.path.join(args.output_dir,
                                                  predict_name)
            with open(output_prediction_file, "w") as f:
                for each in all_results:
                    f.write(each + ',' + str(all_results[each]) + "\n")
            raw_test_data_pre.close()
            pre_ac = 0
            # Re-read the predictions that were just written above.
            reader2 = pd.read_csv(output_prediction_file,
                                  usecols=[0, 1],
                                  header=None)

            for index2, ans2 in zip(reader2[0], reader2[1]):
                num = index2[6:12]
                num = int(num) - 577157
                ans1 = reader1[1][num]
                if ans1 == ans2:
                    pre_ac += 1
            logger.info("correct dev predictions: %d", pre_ac)
            # 23011 is assumed to be the number of dev examples and 128 the
            # predict batch size.
            per = pre_ac / 23011
            pernum = per * 100
            logger.info("accuracy:%f", pernum)
            devlossmean = total_dev_loss / (23011 / 128)
            logger.info("devloss:%f", devlossmean)
            via.line([pernum], [epoch], win='accuracy', update='append')
            via.line([devlossmean], [epoch], win='loss', update='append')
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'amp': amp.state_dict()
            }
            torch.save(checkpoint, 'checkpoint/amp_checkpoint.pt')

            outmodel = 'ln11samodel' + str(pernum) + '.bin'
            output_model_file = os.path.join(args.output_dir, outmodel)
            if args.do_train:
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                torch.save(model_to_save.state_dict(), output_model_file)
        raw_test_data.close()
        raw_train_data.close()


    # Save a trained model
    #    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    #    if args.do_train:
    #        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    #        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        list1 = os.listdir('./output_model/')
        list1 = sorted(
            list1,
            key=lambda x: os.path.getmtime(os.path.join('./output_model/', x)))
        output_model_file = os.path.join(args.output_dir, list1[-1])
        # output_model_file = os.path.join(args.output_dir, 'n11samodel77.33258007040111.bin')
        model_state_dict = torch.load(output_model_file)
        model = BertForCloze(bert_config, num_choices=10)
        model.load_state_dict(model_state_dict)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # raw_test_data_pre = open('./data/dev.txt', mode='r')
        raw_test_data_pre = open('./data/out.txt', mode='r')
        # raw_test_data_pre = open('new_test_data.txt', mode='r')
        eval_examples = read_chid_examples(raw_test_data_pre,
                                           is_training=False)
        # eval_examples=eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_option_ids = torch.tensor([f.option_ids for f in eval_features],
                                      dtype=torch.long)
        all_positions = torch.tensor([f.position for f in eval_features],
                                     dtype=torch.long)
        all_tags = torch.tensor([f.tag for f in eval_features],
                                dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_option_ids,
                                  all_positions, all_tags)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = {}
        all_results1 = {}
        all_results2 = {}
        # reader1 = pd.read_csv('test_ans.csv', usecols=[1], header=None)
        reader1 = pd.read_csv('./data/out_answer.csv',
                              usecols=[1],
                              header=None)  #dev_answer1.csv
        # reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None)
        total_dev_loss = 0
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, option_ids, positions, tags in \
                tqdm(eval_dataloader, desc="Evaluating",disable=None):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            option_ids = option_ids.to(device)
            positions = positions.to(device)
            with torch.no_grad():
                batch_logits, align = model(input_ids, option_ids, segment_ids,
                                            input_mask, positions)
            for i, tag in enumerate(tags):
                logits = batch_logits[i].detach().cpu().numpy()

                ans = np.argmax(logits)
                all_results["#idiom%06d#" % tag] = ans

                # matric = align[i].detach().cpu().numpy()
                # all_results1["#idiom%06d#" % tag] = matric[ans]
                # gr_logic = logits[:]
                # gr_logic = sorted(gr_logic, reverse=True)
                # all_results2["#idiom%06d#" % tag] = gr_logic

        output_prediction_file = os.path.join(args.output_dir,
                                              "testprediction.csv")
        # output_m_file = os.path.join(args.output_dir, "ealign.csv")
        # output_ma_file = os.path.join(args.output_dir, "sdmv.csv")

        with open(output_prediction_file, "w") as f:
            for each in all_results:
                f.write(each + ',' + str(all_results[each]) + "\n")
        # with open(output_m_file, "w") as f:
        #     for each in all_results1:
        #         f.write(each + ',' + str(all_results1[each]) + "\n")
        # with open(output_ma_file, "w") as f:
        #     for each in all_results1:
        #         f.write(each + ',' + str(all_results2[each]) + "\n")
        raw_test_data_pre.close()
        reader2 = pd.read_csv(output_prediction_file,
                              usecols=[0, 1],
                              header=None)
        pre_ac = 0
        for index2, ans2 in zip(reader2[0], reader2[1]):
            # Strip the "#idiom...#" wrapper and map the global ID to a
            # 0-based row; 623377 is assumed to be the first test example ID.
            num = index2[6:-1]
            # num = int(num)-1
            # num = re.findall(r"\d+\.?\d*",index2)
            num = int(num) - 623377
            # num = int(num) - 577157
            ans1 = reader1[1][num]
            if ans1 == ans2:
                pre_ac += 1
        logger.info("correct test predictions: %d", pre_ac)
        # per = pre_ac / 23011
        # per = pre_ac / 24948
        per = pre_ac / 27704  # 27704 is assumed to be the number of test examples
        pernum = per * 100
        logger.info("accuracy:%f", pernum)
Exemplo n.º 24
0
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(comment=args.summary_comment)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)

    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)

    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info(
                "  Will skip the first %d steps in the first epoch",
                steps_trained_in_current_epoch,
            )
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    # Resize the embedding matrix in case the tokenizer added new tokens
    # (a no-op when the vocabulary size is unchanged).
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            if args.wiki_dataset:
                if args.mlm:
                    raise RuntimeError(
                        "Can't do mlm for wiki / dictionary dataset")

                tokens, loss_mask = batch
                inputs, labels = (tokens, tokens)

                loss_mask = loss_mask.to(args.device)
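                # Assuming loss_mask marks title tokens: weight them by
                # args.title_scale and leave all other tokens at weight 1.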
                loss_weights = (~loss_mask) + loss_mask * args.title_scale
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(inputs,
                                labels=labels,
                                loss_weights=loss_weights)
            else:
                inputs, labels = mask_tokens(
                    batch, tokenizer, args) if args.mlm else (batch, batch)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)

            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss",
                        (tr_loss - logging_loss) / args.logging_steps,
                        global_step,
                    )
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
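
Note: _rotate_checkpoints(), used in the saving branch above, is not shown. A minimal sketch of such a helper, assuming an args.save_total_limit attribute and rotation by modification time (the real transformers utility also parses the step number from the directory name):

import glob
import os
import shutil

def _rotate_checkpoints(args, checkpoint_prefix="checkpoint"):
    # Delete the oldest checkpoints so at most args.save_total_limit remain.
    if not getattr(args, "save_total_limit", None):
        return
    paths = sorted(
        glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))),
        key=os.path.getmtime)
    for path in paths[:-args.save_total_limit]:
        shutil.rmtree(path)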
Exemplo n.º 25
0
def main(args):
    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # logger (assumes save_dir looks like '<root>/<run_name>')
    logger_file_name = args.save_dir.split('/')[1]
    fileHandler = logging.FileHandler(
        os.path.join(args.save_dir, "%s.txt" % (logger_file_name)))
    logger.addHandler(fileHandler)
    logger.info(args)

    # cuda setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("device: {}".format(device))

    # set random seed
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    if device == "cuda":
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    #******************************************************
    # load data
    #******************************************************
    processor = Processor(args)
    slot_meta = processor.slot_meta
    label_list = processor.label_list

    num_labels = [len(labels) for labels in label_list]
    logger.info(slot_meta)

    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model)

    train_data_raw = processor.get_train_instances(args.data_dir, tokenizer)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = processor.get_dev_instances(args.data_dir, tokenizer)
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = processor.get_test_instances(args.data_dir, tokenizer)
    print("# test examples %d" % len(test_data_raw))
    logger.info("Data loaded!")

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 word_dropout=args.word_dropout)

    num_train_steps = int(
        len(train_data_raw) / args.train_batch_size * args.n_epochs)
    logger.info("***** Run training *****")
    logger.info(" Num examples = %d", len(train_data_raw))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_steps)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    #******************************************************
    # build model
    #******************************************************
    ## Initialize slot and value embeddings
    sv_encoder = UtteranceEncoding.from_pretrained(args.pretrained_model)
    for p in sv_encoder.bert.parameters():
        p.requires_grad = False

    new_label_list, slot_value_pos = combine_slot_values(
        slot_meta, label_list)  # without slot head
    logger.info(slot_value_pos)
    slot_lookup = get_label_lookup_from_first_token(slot_meta, tokenizer,
                                                    sv_encoder, device)
    value_lookup = get_label_lookup_from_first_token(new_label_list, tokenizer,
                                                     sv_encoder, device)

    model = BeliefTracker(args, slot_lookup, value_lookup, num_labels,
                          slot_value_pos, device)
    model.to(device)

    ## prepare optimizer
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in enc_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = get_linear_schedule_with_warmup(
        enc_optimizer, int(num_train_steps * args.enc_warmup), num_train_steps)

    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = get_linear_schedule_with_warmup(
        dec_optimizer, int(num_train_steps * args.dec_warmup), num_train_steps)

    logger.info(enc_optimizer)
    logger.info(dec_optimizer)

    #******************************************************
    # training
    #******************************************************
    logger.info("Training...")

    best_loss = None
    best_acc = None
    last_update = None
    for epoch in trange(int(args.n_epochs), desc="Epoch"):
        batch_loss = []
        batch_acc = []

        for step, batch in enumerate(tqdm(train_dataloader)):
            model.train()

            batch = [b.to(device) if b is not None else b for b in batch]
            input_ids, segment_ids, input_mask, label_ids = batch

            # forward
            loss, _, acc, _, _ = model(input_ids=input_ids,
                                       attention_mask=input_mask,
                                       token_type_ids=segment_ids,
                                       labels=label_ids)

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            batch_loss.append(loss.item())
            batch_acc.append(acc)
            if step % 300 == 0:
                print("[%d/%d] [%d/%d] mean_loss: %.6f, mean_joint_acc: %.6f" % \
                      (epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), np.mean(batch_acc)))
                batch_loss = []
                batch_acc = []

            if epoch > args.n_epochs / 2 and step > 0 and step % args.eval_step == 0:
                eval_res = model_evaluation(model, dev_data_raw, tokenizer,
                                            slot_meta, label_list,
                                            epoch * 10 + step / args.eval_step)
                if last_update is None or best_loss > eval_res['loss']:
                    best_loss = eval_res['loss']
                    save_path = os.path.join(args.save_dir,
                                             'model_best_loss.bin')
                    torch.save(model.state_dict(), save_path)
                    print("Best Loss : ", best_loss)
                    print("\n")
                if last_update is None or best_acc < eval_res['joint_acc']:
                    best_acc = eval_res['joint_acc']
                    save_path = os.path.join(args.save_dir,
                                             'model_best_acc.bin')
                    torch.save(model.state_dict(), save_path)
                    print("Best Acc : ", best_acc)
                    print("\n")

                logger.info("*** Step=%d, Dev Loss=%.6f, Dev Acc=%.6f, Dev Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***" % \
                           (step, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc))

            if epoch > args.n_epochs / 2 and step > 0 and step % args.eval_step == 0:
                eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, \
                                            label_list, epoch*10+step/args.eval_step)

                logger.info("*** Step=%d, Tes Loss=%.6f, Tes Acc=%.6f, Tes Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***" % \
                           (step, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc))

        if (epoch + 1) % args.eval_epoch == 0:
            eval_res = model_evaluation(model, dev_data_raw, tokenizer,
                                        slot_meta, label_list, epoch + 1)
            if last_update is None or best_loss > eval_res['loss']:
                best_loss = eval_res['loss']
                save_path = os.path.join(args.save_dir, 'model_best_loss.bin')
                torch.save(model.state_dict(), save_path)
                print("Best Loss : ", best_loss)
                print("\n")
            if last_update is None or best_acc < eval_res['joint_acc']:
                best_acc = eval_res['joint_acc']
                save_path = os.path.join(args.save_dir, 'model_best_acc.bin')
                torch.save(model.state_dict(), save_path)
                last_update = epoch
                print("Best Acc : ", best_acc)
                print("\n")

            logger.info(
                "*** Epoch=%d, Last Update=%d, Dev Loss=%.6f, Dev Acc=%.6f, Dev Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***"
                % (epoch, last_update, eval_res['loss'], eval_res['joint_acc'],
                   eval_res['joint_turn_acc'], best_loss, best_acc))

        if (epoch + 1) % args.eval_epoch == 0:
            eval_res = model_evaluation(model, test_data_raw, tokenizer,
                                        slot_meta, label_list, epoch + 1)

            logger.info(
                "*** Epoch=%d, Last Update=%d, Tes Loss=%.6f, Tes Acc=%.6f, Tes Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***"
                % (epoch, last_update, eval_res['loss'], eval_res['joint_acc'],
                   eval_res['joint_turn_acc'], best_loss, best_acc))

        if last_update is not None and last_update + args.patience <= epoch:
            break

    print("Test using best loss model...")
    best_epoch = 0
    ckpt_path = os.path.join(args.save_dir, 'model_best_loss.bin')
    model = BeliefTracker(args, slot_lookup, value_lookup, num_labels,
                          slot_value_pos, device)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    test_res = model_evaluation(model,
                                test_data_raw,
                                tokenizer,
                                slot_meta,
                                label_list,
                                best_epoch,
                                is_gt_p_state=False)
    logger.info("Results based on best loss: ")
    logger.info(test_res)
    #----------------------------------------------------------------------
    print("Test using best acc model...")
    ckpt_path = os.path.join(args.save_dir, 'model_best_acc.bin')
    model = BeliefTracker(args, slot_lookup, value_lookup, num_labels,
                          slot_value_pos, device)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    test_res = model_evaluation(model,
                                test_data_raw,
                                tokenizer,
                                slot_meta,
                                label_list,
                                best_epoch + 1,
                                is_gt_p_state=False)
    logger.info("Results based on best acc: ")
    logger.info(test_res)
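
Note: both enc_scheduler and dec_scheduler above come from get_linear_schedule_with_warmup, which scales the base learning rate by a piecewise-linear multiplier. A minimal sketch of that multiplier, matching the transformers implementation:

def linear_warmup_decay(current_step, num_warmup_steps, num_training_steps):
    # Ramp from 0 to 1 over the warm-up steps, then decay linearly to 0.
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - current_step) /
               float(max(1, num_training_steps - num_warmup_steps)))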
Exemplo n.º 26
0
def train(args, train_dataset, model, tokenizer, ori_dict):
    """ Train the model """
    record_result = []
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    pruning_step = 0
    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()


            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        rate_weight_equal_zero = see_weight_rate(model)
                        print('zero_rate = ', rate_weight_equal_zero)
                        
                        results = evaluate(args, model, tokenizer)
                        print(results)
                        record_result.append(results)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

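                    # Iterative magnitude pruning: pruning 1/(10 - k) of the
                    # remaining weights at round k removes a further ~10% of
                    # the original weights each round.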
                    pruning_model(model, 1/(10-pruning_step))
                    rate_weight_equal_zero = see_weight_rate(model)
                    pruning_step += 1
                    print('zero_rate = ', rate_weight_equal_zero)

                    print('starting rewinding')
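                    # Lottery-ticket rewinding: copy the original weight
                    # values in ori_dict back over the surviving weights.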
                    model_dict = model.state_dict()
                    model_dict.update(ori_dict)
                    model.load_state_dict(model_dict)
                    
                    print('optimizer rewinding')
                    no_decay = ["bias", "LayerNorm.weight"]
                    optimizer_grouped_parameters = [
                        {
                            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                            "weight_decay": args.weight_decay,
                        },
                        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
                    ]
                    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
                    scheduler = get_linear_schedule_with_warmup(
                        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
                    )


                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(model, os.path.join(output_dir, "model.pt"))

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
                    

            if pruning_step == 10:
                epoch_iterator.close()
                break

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if pruning_step == 10:
            train_iterator.close()
            break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    torch.save(record_result, os.path.join(args.output_dir, "result.pt"))

    return global_step, tr_loss / global_step
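
Note: pruning_model() and see_weight_rate() are not shown in this example. A minimal sketch of what such helpers could look like, assuming global magnitude pruning over the model's Linear layers with torch.nn.utils.prune (the names and layer selection are illustrative, not the author's exact code):

import torch
import torch.nn.utils.prune as prune

def pruning_model(model, px):
    # Prune the fraction `px` of the remaining weights with the smallest
    # L1 magnitude, pooled globally across all Linear layers.
    params = [(m, "weight") for m in model.modules()
              if isinstance(m, torch.nn.Linear)]
    prune.global_unstructured(params,
                              pruning_method=prune.L1Unstructured,
                              amount=px)

def see_weight_rate(model):
    # Fraction of Linear-layer weights currently masked to zero.
    zeros, total = 0, 0
    for m in model.modules():
        if isinstance(m, torch.nn.Linear) and hasattr(m, "weight_mask"):
            zeros += float((m.weight_mask == 0).sum())
            total += m.weight_mask.numel()
    return zeros / max(1, total)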
Exemplo n.º 27
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints will be written.",
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )

    args = parser.parse_args()
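
    # A hypothetical invocation with the required flags (script name and
    # data paths are placeholders, not taken from this example):
    #   python run_swag.py --data_dir ./data/swag --bert_model bert-base-uncased \
    #     --output_dir ./swag_out --do_train --do_lower_case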

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                               "distributed_{}".format(args.local_rank)),
        num_choices=4,
    )
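    # num_choices=4 matches SWAG, where every question pairs a context with
    # four candidate endings.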
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:

        # Prepare data loader

        train_examples = read_swag_examples(os.path.join(
            args.data_dir, "train.csv"),
                                            is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      True)
        all_input_ids = torch.tensor(select_field(train_features, "input_ids"),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   "input_mask"),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    "segment_ids"),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        num_train_optimization_steps = (len(train_dataloader) //
                                        args.gradient_accumulation_steps *
                                        args.num_train_epochs)
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # Note: unlike BertForQuestionAnswering in the script below,
        # BertForMultipleChoice does use the pooler, so no parameters are
        # filtered out here.

        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
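        # For illustration (typical BERT parameter names, shown as an
        # assumption about the model's naming): a parameter such as
        # "bert.encoder.layer.0.output.LayerNorm.weight" matches the no_decay
        # list and gets weight_decay 0.0, while
        # "bert.encoder.layer.0.output.dense.weight" falls into the 0.01 group.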
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                bias_correction=False,
                max_grad_norm=1.0,
            )
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
            )
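
        # A minimal sketch (illustrative, not used below) of the linear
        # warmup-then-decay multiplier that warmup_linear applies, assuming the
        # classic pytorch_pretrained_bert formulation; later class-based
        # schedules smooth the corner after warmup to (1 - x) / (1 - warmup).
        def _warmup_linear_multiplier(progress, warmup):
            # progress = global_step / t_total, warmup = warmup proportion
            if progress < warmup:
                return progress / warmup  # ramp up from 0 to 1
            return max(0.0, 1.0 - progress)  # then decay linearly to 0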

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                # loss scaling for fp16 is handled inside FP16_Optimizer via
                # optimizer.backward(loss), so the loss is not rescaled here
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step)
                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForMultipleChoice.from_pretrained(args.output_dir,
                                                      num_choices=4)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                      num_choices=4)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(
            args.data_dir, "val.csv"),
                                           is_training=True)
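        # is_training=True above so that gold labels are loaded, which the
        # accuracy computation below requires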
        eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                     args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, "input_ids"),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features,
                                                   "input_mask"),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features,
                                                    "segment_ids"),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features],
                                 dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                      label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
        }
        if args.do_train:
            # training stats only exist when training actually ran
            result["global_step"] = global_step
            result["loss"] = tr_loss / global_step

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--version_2_with_negative',
                        action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold',
                        type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(args.bert_model,
                cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used for QA and thus produces
    # None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does the scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
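                        # warmup_linear here is assumed to be the schedule
                        # helper imported from pytorch_pretrained_bert.optimization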
                        lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)

    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
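        # all_example_index lets the prediction loop map each batch row back to
        # its original feature (and thus its unique_id) after batching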
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative, args.null_score_diff_threshold)
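
# Illustrative invocation (script name and file paths are placeholders):
#   python run_squad.py --bert_model bert-base-uncased --do_train --do_predict \
#       --train_file train-v1.1.json --predict_file dev-v1.1.json \
#       --output_dir ./squad_out --do_lower_case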
Exemplo n.º 29
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc = 0.0
    best_steps = 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2]
                if args.model_type in ["bert", "xlnet"]
                else None,  # XLM don't use segment_ids
                "labels": batch[3],
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always a tuple in transformers (see the docs)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
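            # Note on the fp16 branch above: under apex amp, gradient clipping
            # must target the FP32 master weights, hence amp.master_params(optimizer)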

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        if results["eval_acc"] > best_dev_acc:
                            best_dev_acc = results["eval_acc"]
                            best_steps = global_step
                            if args.do_test:
                                results_test = evaluate(args, model, tokenizer, test=True)
                                for key, value in results_test.items():
                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
                                logger.info(
                                    "test acc: %s, loss: %s, global steps: %s",
                                    str(results_test["eval_acc"]),
                                    str(results_test["eval_loss"]),
                                    str(global_step),
                                )

                        file0 = open("train_eval_logs_sqa_FIXED_180_tl.txt", "a") 
                        file0.write(str(results["eval_acc"]) + ','\
                        + str(results["eval_loss"]) + "," + \
                        str(global_step) + "\n") 
                        file0.close()


                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logger.info(
                        "Average loss: %s at global step: %s",
                        str((tr_loss - logging_loss) / args.logging_steps),
                        str(global_step),
                    )

                    # Append average training loss to a plain-text log file
                    with open("train_loss_logs_sqa_FIXED_180_tl.txt", "a") as loss_log:
                        loss_log.write(str((tr_loss - logging_loss) / args.logging_steps)
                                       + "," + str(global_step) + "\n")

                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step, best_steps
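
# Typical call site (illustrative): after building train_dataset,
#   global_step, avg_loss, best_steps = train(args, train_dataset, model, tokenizer)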
Exemplo n.º 30
0
# NOTE: the opening statement of this snippet was truncated in the source; the
# first split below is reconstructed from the variables used afterwards, and
# the label array name (Y) is an assumption.
X_train, X_valid, Y_train, Y_valid = train_test_split(X,
                                                      Y,
                                                      random_state=20,
                                                      test_size=0.1)
Mask_train, Mask_valid, _, _ = train_test_split(attention_masks,
                                                X,
                                                random_state=20,
                                                test_size=0.1)

X_train = torch.tensor(X_train).to(device).long()
X_valid = torch.tensor(X_valid).to(device).long()
Y_train = torch.tensor(Y_train).to(device).long()
Y_valid = torch.tensor(Y_valid).to(device).long()
Mask_train = torch.tensor(Mask_train).to(device).long()
Mask_valid = torch.tensor(Mask_valid).to(device).long()

data_train = TensorDataset(X_train, Mask_train, Y_train)
data_train_sampler = RandomSampler(data_train)
DL_train = DataLoader(data_train,
                      sampler=data_train_sampler,
                      batch_size=batch_s)

data_valid = TensorDataset(X_valid, Mask_valid, Y_valid)
data_valid_sampler = SequentialSampler(data_valid)
DL_valid = DataLoader(data_valid,
                      sampler=data_valid_sampler,
                      batch_size=batch_s)

model = BertForTokenClassification.from_pretrained("hfl/chinese-bert-wwm",
                                                   num_labels=len(tag2idx))
model.cuda()

FULL_FINETUNING = False