Example #1
0
    def __init__(self, args):
        super().__init__()
        embedding_dim = 768
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', return_dict=True, output_hidden_states=True)

        self.dense = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = nn.LayerNorm(768)
        self.init_weights(self.dense)
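
    # Not shown above: init_weights and the forward pass. The sketch below is an
    # assumption about how such a projection head is typically wired up, not the
    # original implementation.
    def init_weights(self, module):
        # Initialize only the extra linear layer; RoBERTa keeps its pretrained weights.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Project and normalize the <s> (CLS) representation of the last hidden layer.
        cls_repr = outputs.hidden_states[-1][:, 0, :]
        return self.layer_norm(self.dense(cls_repr))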
Example #2
0
 def __init__(self, args: argparse.Namespace):
     super().__init__()
     self.args = args
     self.tokenizer = RobertaTokenizer.from_pretrained(
         self.args.roberta_path)
     self.model = RobertaForSequenceClassification.from_pretrained(
         self.args.roberta_path)
     self.loss_fn = CrossEntropyLoss()
     self.metric = Accuracy(num_classes=2)
     self.num_gpus = len(str(self.args.gpus).split(","))
 def __init__(
     self,
     args: argparse.Namespace
 ):
     """Initialize a model, tokenizer and config."""
     super().__init__()
     self.args = args
     if isinstance(args, argparse.Namespace):
         self.save_hyperparameters(args)
     self.bert_dir = args.bert_path
     self.model = RobertaForSequenceClassification.from_pretrained(self.bert_dir)
     self.tokenizer = RobertaTokenizer.from_pretrained(self.bert_dir)
     self.loss_fn = CrossEntropyLoss()
     self.train_acc = pl.metrics.Accuracy()
     self.valid_acc = pl.metrics.Accuracy()
Example #4
0
    def __init__(self, args: argparse.Namespace):
        super().__init__()
        self.args = args
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.args.roberta_path)
        # self.model = RobertaForSequenceClassification.from_pretrained(self.args.roberta_path)

        self.roberta_config = RobertaConfig.from_pretrained(
            self.args.roberta_path, output_hidden_states=False)
        self.model = RobertaForSequenceClassification(self.roberta_config)

        self.loss_fn = CrossEntropyLoss()
        self.metric = Accuracy(num_classes=2)
        gpus_string = self.args.gpus if not self.args.gpus.endswith(
            ',') else self.args.gpus[:-1]
        self.num_gpus = len(gpus_string.split(","))

        self.predict_neg = []
        self.predict_pos = []
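
    # Hypothetical sketch (not in the original snippet): a minimal training_step,
    # assuming batches arrive as (input_ids, attention_mask, labels) tensors and a
    # binary task, as suggested by Accuracy(num_classes=2).
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        logits = self.model(input_ids, attention_mask=attention_mask)[0]
        loss = self.loss_fn(logits.view(-1, 2), labels.view(-1))
        self.log('train_loss', loss)
        return loss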
Example #5
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()

    num_labels = len(["entailment", "neutral", "contradiction"])
    # pretrain_model_dir = 'roberta-large' #'roberta-large' , 'roberta-large-mnli'
    pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_epoch_2_acc_4.156359461121103'  #'roberta-large' , 'roberta-large-mnli'
    model = RobertaForSequenceClassification.from_pretrained(
        pretrain_model_dir, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    # Prepare optimizer
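    # Split parameters into two groups: biases and LayerNorm weights get no weight
    # decay, everything else gets 0.01.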
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    #MNLI-SNLI-SciTail-RTE-SICK
    train_examples_MNLI, dev_examples_MNLI = processor.get_MNLI_train_and_dev(
        '/export/home/Dataset/glue_data/MNLI/train.tsv',
        '/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv'
    )  #train_pu_half_v1.txt
    train_examples_SNLI, dev_examples_SNLI = processor.get_SNLI_train_and_dev(
        '/export/home/Dataset/glue_data/SNLI/train.tsv',
        '/export/home/Dataset/glue_data/SNLI/dev.tsv')
    train_examples_SciTail, dev_examples_SciTail = processor.get_SciTail_train_and_dev(
        '/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_train.tsv',
        '/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_dev.tsv')
    train_examples_RTE, dev_examples_RTE = processor.get_RTE_train_and_dev(
        '/export/home/Dataset/glue_data/RTE/train.tsv',
        '/export/home/Dataset/glue_data/RTE/dev.tsv')
    train_examples_ANLI, dev_examples_ANLI = processor.get_ANLI_train_and_dev(
        'train', 'dev',
        '/export/home/Dataset/para_entail_datasets/ANLI/anli_v0.1/')

    train_examples = train_examples_MNLI + train_examples_SNLI + train_examples_SciTail + train_examples_RTE + train_examples_ANLI
    dev_examples_list = [
        dev_examples_MNLI, dev_examples_SNLI, dev_examples_SciTail,
        dev_examples_RTE, dev_examples_ANLI
    ]

    dev_task_label = [0, 0, 1, 1, 0]
    task_names = ['MNLI', 'SNLI', 'SciTail', 'RTE', 'ANLI']
    '''filter challenging neighbors'''
    neighbor_id_list = []
    readfile = codecs.open('neighbors_indices_before_dropout_eud.v3.txt', 'r',
                           'utf-8')
    for line in readfile:
        neighbor_id_list.append(int(line.strip()))
    readfile.close()
    print('neighbor_id_list size:', len(neighbor_id_list))
    truncated_train_examples = [train_examples[i] for i in neighbor_id_list]
    train_examples = truncated_train_examples

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0

    train_features = convert_examples_to_features(
        train_examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=
        False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=
        True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=
        False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0)  #4 if args.model_type in ['xlnet'] else 0,)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    all_task_label_ids = torch.tensor([f.task_label for f in train_features],
                                      dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids, all_task_label_ids)
    train_sampler = RandomSampler(train_data)

    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  drop_last=True)
    '''dev data to features'''
    valid_dataloader_list = []
    for valid_examples_i in dev_examples_list:
        valid_features = convert_examples_to_features(
            valid_examples_i,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        logger.info("***** valid_examples *****")
        logger.info("  Num examples = %d", len(valid_examples_i))
        valid_input_ids = torch.tensor([f.input_ids for f in valid_features],
                                       dtype=torch.long)
        valid_input_mask = torch.tensor([f.input_mask for f in valid_features],
                                        dtype=torch.long)
        valid_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_label_ids = torch.tensor([f.label_id for f in valid_features],
                                       dtype=torch.long)
        valid_task_label_ids = torch.tensor(
            [f.task_label for f in valid_features], dtype=torch.long)

        valid_data = TensorDataset(valid_input_ids, valid_input_mask,
                                   valid_segment_ids, valid_label_ids,
                                   valid_task_label_ids)
        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.eval_batch_size)
        valid_dataloader_list.append(valid_dataloader)

    iter_co = 0
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, task_label_ids = batch
            logits = model(input_ids, input_mask, None, labels=None)

            prob_matrix = F.log_softmax(logits[0].view(-1, num_labels), dim=1)
            '''multiplying by 1.0 creates a copy, so the in-place assignment below does not modify the log_softmax output that autograd still needs'''
            new_prob_matrix = prob_matrix * 1.0
            '''change the entail prob to p or 1-p'''
            changed_places = torch.nonzero(task_label_ids, as_tuple=False)
            new_prob_matrix[changed_places,
                            0] = 1.0 - prob_matrix[changed_places, 0]

            loss = F.nll_loss(new_prob_matrix, label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            iter_co += 1

            # if iter_co % len(train_dataloader) ==0:
            if iter_co % (len(train_dataloader) // 5) == 0:
                '''
                start evaluating on the dev sets at this checkpoint
                '''
                # if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
                #     model = torch.nn.DataParallel(model)
                model.eval()
                for m in model.modules():
                    if isinstance(m, torch.nn.BatchNorm2d):
                        m.track_running_stats = False
                # logger.info("***** Running evaluation *****")
                # logger.info("  Num examples = %d", len(valid_examples_MNLI))
                # logger.info("  Batch size = %d", args.eval_batch_size)

                dev_acc_sum = 0.0
                for idd, valid_dataloader in enumerate(valid_dataloader_list):
                    task_label = dev_task_label[idd]
                    eval_loss = 0
                    nb_eval_steps = 0
                    preds = []
                    gold_label_ids = []
                    # print('Evaluating...', task_label)
                    # for _, batch in enumerate(tqdm(valid_dataloader, desc=task_names[idd])):
                    for _, batch in enumerate(valid_dataloader):
                        batch = tuple(t.to(device) for t in batch)
                        input_ids, input_mask, segment_ids, label_ids, task_label_ids = batch
                        if task_label == 0:
                            gold_label_ids += list(
                                label_ids.detach().cpu().numpy())
                        else:
                            '''SciTail, RTE'''
                            task_label_ids_list = list(
                                task_label_ids.detach().cpu().numpy())
                            gold_label_batch_fake = list(
                                label_ids.detach().cpu().numpy())
                            for ex_id, label_id in enumerate(
                                    gold_label_batch_fake):
                                if task_label_ids_list[ex_id] == 0:
                                    gold_label_ids.append(label_id)  #0
                                else:
                                    gold_label_ids.append(1)  #1
                        with torch.no_grad():
                            logits = model(input_ids=input_ids,
                                           attention_mask=input_mask,
                                           token_type_ids=None,
                                           labels=None)
                        logits = logits[0]
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0],
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)

                    preds = preds[0]
                    pred_probs = softmax(preds, axis=1)
                    pred_label_ids_3way = np.argmax(pred_probs, axis=1)
                    if task_label == 0:
                        '''3-way tasks MNLI, SNLI, ANLI'''
                        pred_label_ids = pred_label_ids_3way
                    else:
                        '''SciTail, RTE'''
                        pred_label_ids = []
                        for pred_label_i in pred_label_ids_3way:
                            if pred_label_i == 0:
                                pred_label_ids.append(0)
                            else:
                                pred_label_ids.append(1)
                    assert len(pred_label_ids) == len(gold_label_ids)
                    hit_co = 0
                    for k in range(len(pred_label_ids)):
                        if pred_label_ids[k] == gold_label_ids[k]:
                            hit_co += 1
                    test_acc = hit_co / len(gold_label_ids)
                    dev_acc_sum += test_acc
                    print(task_names[idd], ' dev acc:', test_acc)
                '''store the model so that we can run a test once a new max dev acc is reached'''
                model_to_save = (
                    model.module if hasattr(model, "module") else model
                )  # Take care of distributed/parallel training
                store_transformers_models(
                    model_to_save, tokenizer,
                    '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/',
                    'RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_'
                    + str(epoch_i) + '_acc_' + str(dev_acc_sum))
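
# store_transformers_models is not defined in this snippet; a minimal sketch of
# what such a helper usually does with Hugging Face models (an assumption, not
# the original code; it presumes `import os` at module level):
def store_transformers_models(model, tokenizer, output_dir, flag_str):
    """Save a model/tokenizer pair under output_dir/flag_str."""
    target_dir = os.path.join(output_dir, flag_str)
    os.makedirs(target_dir, exist_ok=True)
    model.save_pretrained(target_dir)
    tokenizer.save_pretrained(target_dir)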
Example #6
0
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path: str,
                                          pytorch_dump_folder_path: str,
                                          classification_head: bool):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    config = RobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(
        config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[
            i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.k_proj.weight.data.shape ==
                roberta_layer.self_attn.q_proj.weight.data.shape ==
                roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads[
            "mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads[
            "mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads[
            "mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads[
            "mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](
            roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
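
# Hypothetical entry point (not part of the snippet above): how this converter
# might be driven from the command line; the flag names here are assumptions.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--roberta_checkpoint_path", type=str, required=True,
                        help="Path to the fairseq RoBERTa checkpoint directory.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Directory in which to save the converted PyTorch model.")
    parser.add_argument("--classification_head", action="store_true",
                        help="Also convert the 'mnli' classification head.")
    args = parser.parse_args()
    convert_roberta_checkpoint_to_pytorch(args.roberta_checkpoint_path,
                                          args.pytorch_dump_folder_path,
                                          args.classification_head)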
Example #7
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,  #2 classes
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    '''update the roberta parameters with my 3-way model'''
    model_roberta = RobertaForSequenceClassification.from_pretrained(
        '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_51_acc_4.199802825942953',
        num_labels=3)
    # model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', num_labels=3)
    model.roberta.load_state_dict(model_roberta.roberta.state_dict())

    # Get datasets
    train_dataset = (GlueDataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="dev",
                            cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="test",
                            cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))
    return eval_results
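
# Standard entry-point guard (assumed; not shown in the snippet above):
if __name__ == "__main__":
    main()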
Example #8
0
 def __init__(self, config, model_argobj=None):
     NLL.__init__(self, model_argobj)
     RobertaForSequenceClassification.__init__(self, config)
     self.embeddingHead = nn.Linear(config.hidden_size, 768)
     self.norm = nn.LayerNorm(768)
     self.apply(self._init_weights)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    num_labels = len(["entailment", "neutral", "contradiction"])
    pretrain_model_dir = 'roberta-large-mnli'  #'roberta-large' , 'roberta-large-mnli'
    model = RobertaForSequenceClassification.from_pretrained(
        pretrain_model_dir, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    valid_examples_MNLI, label_list_MNLI = processor.get_MNLI_as_train(
        '/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv')

    valid_features = convert_examples_to_features(
        valid_examples_MNLI,
        label_list_MNLI,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=
        False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=
        True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=
        False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0)  #4 if args.model_type in ['xlnet'] else 0,)

    logger.info("***** valid_examples *****")
    logger.info("  Num examples = %d", len(valid_examples_MNLI))
    valid_input_ids = torch.tensor([f.input_ids for f in valid_features],
                                   dtype=torch.long)
    valid_input_mask = torch.tensor([f.input_mask for f in valid_features],
                                    dtype=torch.long)
    valid_segment_ids = torch.tensor([f.segment_ids for f in valid_features],
                                     dtype=torch.long)
    valid_label_ids = torch.tensor([f.label_id for f in valid_features],
                                   dtype=torch.long)

    valid_data = TensorDataset(valid_input_ids, valid_input_mask,
                               valid_segment_ids, valid_label_ids)
    valid_sampler = SequentialSampler(valid_data)

    valid_dataloader = DataLoader(valid_data,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    #MNLI-SNLI-SciTail-RTE-SICK
    train_examples_MNLI, label_list_MNLI = processor.get_MNLI_as_train(
        '/export/home/Dataset/glue_data/MNLI/train.tsv')  #train_pu_half_v1.txt
    train_examples_SNLI, label_list_SNLI = processor.get_SNLI_as_train(
        '/export/home/Dataset/glue_data/SNLI/train.tsv')
    # train_examples_SciTail, label_list_SciTail = processor.get_SciTail_as_train('/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_train.tsv')
    # train_examples_RTE, label_list_RTE = processor.get_RTE_as_train('/export/home/Dataset/glue_data/RTE/train.tsv')
    # train_examples_SICK = processor.get_SICK_as_train('/export/home/Dataset/glue_data/RTE/train.tsv')
    '''iterate over each dataset'''
    dataset_name_list = ['MNLI', 'SNLI']
    dataset_list = [train_examples_MNLI, train_examples_SNLI]
    dataset_label_list = [label_list_MNLI, label_list_SNLI]
    for dataset_id, train_examples in enumerate(dataset_list):
        label_list = dataset_label_list[dataset_id]
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        max_test_acc = 0.0
        max_dev_acc = 0.0

        train_features = convert_examples_to_features(
            train_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        iter_co = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, input_mask, None, labels=None)
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits[0].view(-1, num_labels),
                                label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                if iter_co % 500 == 0:
                    print('loss........:', loss)
                if iter_co % len(train_dataloader) == 0:
                    '''
                    start evaluating on the MNLI dev set after this epoch
                    '''
                    model.eval()
                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d",
                                len(valid_examples_MNLI))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    eval_loss = 0
                    nb_eval_steps = 0
                    preds = []
                    gold_label_ids = []
                    print('Evaluating...')
                    for input_ids, input_mask, segment_ids, label_ids in valid_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        gold_label_ids += list(
                            label_ids.detach().cpu().numpy())

                        with torch.no_grad():
                            logits = model(input_ids,
                                           input_mask,
                                           None,
                                           labels=None)
                        logits = logits[0]
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0],
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)

                    preds = preds[0]
                    pred_probs = softmax(preds, axis=1)
                    pred_label_ids = np.argmax(pred_probs, axis=1)
                    assert len(pred_label_ids) == len(gold_label_ids)
                    hit_co = 0
                    for k in range(len(pred_label_ids)):
                        if pred_label_ids[k] == gold_label_ids[k]:
                            hit_co += 1
                    test_acc = hit_co / len(gold_label_ids)
                    print('valid acc:', test_acc)
        '''store the model so that we can run a test once a new max dev acc is reached'''
        store_transformers_models(
            model, tokenizer,
            '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/',
            '->'.join(dataset_name_list[:dataset_id + 1]))
    def __init__(self, args):
        super().__init__()
        embedding_dim = 768
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', return_dict=True, output_hidden_states=True)

        #self.bert.config = sm.config
        #self.roberta.resize_token_embeddings(119567)
        #self.tie_weights()

        self.dense = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = nn.LayerNorm(768)
        self.init_weights(self.dense)
        field = args.field

        if field == 'sparse_16_title':
            self.his_len = 16
            self.set_len = 32
        elif field == 'sparse_60_title':
            self.his_len = 60
            self.set_len = 32
        elif field == 'sparse_60_cat':
            self.his_len = 60
            self.set_len = 32
        elif field == 'sparse_20_cat_abs':
            self.his_len = 20
            self.set_len = 96
        elif field == 'sparse_120_title':
            self.his_len = 120
            self.set_len = 32
        elif field == 'sparse_120_cat':
            self.his_len = 120
            self.set_len = 32
        elif field == 'sparse_40_cat_abs':
            self.his_len = 40
            self.set_len = 96
        elif field == 'sparse_60_cat_abs':
            self.his_len = 60
            self.set_len = 96
        elif field == 'sparse_60_title_last':
            self.his_len = 60
            self.set_len = 32
        elif field == 'sparse_60_cat_last':
            self.his_len = 60
            self.set_len = 32
        elif field == 'sparse_80_title_reverse':
            self.his_len = 80
            self.set_len = 32
        elif field == 'sparse_80_title_non_reverse':
            self.his_len = 80
            self.set_len = 32
        elif field == 'sparse_16_title_reverse':
            self.his_len = 16
            self.set_len = 32
        elif field == 'sparse_16_title_non_reverse':
            self.his_len = 16
            self.set_len = 32
        sm = SparseRobertaForSequenceClassification.from_pretrained(
            'roberta-base', return_dict=True, output_hidden_states=True)

        if 'reverse' in field:
            sm.make_long_and_sparse(
                self.his_len * self.set_len, "variable", 16, False,
                [32] * int(self.set_len * self.his_len / 512), [0])
            self.atten_mask = torch.zeros(
                (self.his_len * self.set_len, self.his_len * self.set_len))
        elif 'last' in field:
            sm.make_long_and_sparse(
                self.his_len * self.set_len + 10 * 64, "longformer", 16, True,
                self.set_len,
                list(
                    range(0,
                          int(self.set_len / 16) * self.his_len,
                          int(self.set_len / 16))))
            self.atten_mask = torch.zeros(
                (self.his_len * self.set_len + 10 * 64,
                 self.his_len * self.set_len + 10 * 64))
        else:
            sm.make_long_and_sparse(
                self.his_len * self.set_len, "longformer", 16, True,
                self.set_len,
                list(
                    range(0,
                          int(self.set_len / 16) * self.his_len,
                          int(self.set_len / 16))))
            self.atten_mask = torch.zeros(
                (self.his_len * self.set_len, self.his_len * self.set_len))

        self.sparse_roberta = sm.roberta

        self.atten_mask[0, :] = 1
        if 'non_reverse' in field:
            self.atten_mask[:, 0] = 1
            # self.atten_mask[0:512,0:512]=1
            # self.atten_mask[512:1024,512:1024]=1
            # self.atten_mask[1024:1536,1024:1536]=1
            # self.atten_mask[1536:2048,1536:2048]=1
            # self.atten_mask[2048:2560,2048:2560]=1
            for item in range(0, self.set_len * self.his_len, 512):
                self.atten_mask[item:item + 512, item:item + 512] = 1

        elif 'reverse' in field:
            self.atten_mask = None

        # if 'reverse' in field:
        #     self.atten_mask[:,0]=1
        #     for item in range(0,self.set_len*self.his_len,512):
        #         self.atten_mask[item:item+512,item:item+512]=1

        elif 'last' not in field:
            self.atten_mask[:, 1] = 1
            for item in range(0,
                              int(self.set_len / 16) * self.his_len,
                              int(self.set_len / 16)):
                start = item * 16
                # self.atten_mask[start,:]=1#global
                self.atten_mask[:, start] = 1
            for item in range(self.his_len):
                self.atten_mask[item * self.set_len:(item + 1) * self.set_len,
                                item * self.set_len:(item + 1) *
                                self.set_len] = 1
        else:
            self.atten_mask[:, 1] = 1
            for item in range(0,
                              int(self.set_len / 16) * (self.his_len - 10),
                              int(self.set_len / 16)):
                start = item * 16
                self.atten_mask[:, start] = 1

            for item in range(
                    int(self.set_len / 16) * (self.his_len - 10),
                    int(self.set_len / 16) * (self.his_len - 10) + int(
                        (self.set_len + 64) / 16) * 10,
                    int((self.set_len + 64) / 16)):
                start = item * 16
                self.atten_mask[:, start] = 1

            for item in range(self.his_len - 10):
                self.atten_mask[item * self.set_len:(item + 1) * self.set_len,
                                item * self.set_len:(item + 1) *
                                self.set_len] = 1
            for item in range(self.his_len - 10, self.his_len):
                start = 50 * self.set_len + (item - 50) * (self.set_len + 64)
                end = start + (self.set_len + 64)
                self.atten_mask[start:end, start:end] = 1

        self.field = field
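
# A minimal standalone sketch of the masking pattern built above, added for
# readability. The function name and the tiny default sizes are illustrative only
# and do not appear in the original code; the real model uses his_len/set_len
# pairs such as 60/32 or 80/32. It mirrors the plain "longformer" branch: a global
# first token, one globally attendable anchor per history entry, and a dense local
# block per history entry.
def _sketch_local_global_mask(his_len=4, set_len=8):
    import torch  # torch is already a dependency of the class above
    seq_len = his_len * set_len
    mask = torch.zeros(seq_len, seq_len)
    mask[0, :] = 1          # the first position may attend everywhere
    mask[:, 1] = 1          # every position may attend to position 1
    for i in range(his_len):
        s, e = i * set_len, (i + 1) * set_len
        mask[:, s] = 1      # first token of each history entry is globally attendable
        mask[s:e, s:e] = 1  # tokens within one history entry attend to each other
    return mask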


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Distributed training: each process drives a single local GPU.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

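    # Scale the per-GPU batch size by the number of GPUs, then divide by the number
    # of gradient accumulation steps so the effective batch size stays at
    # per_gpu_train_batch_size * n_gpu.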
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = [0, 1]

    num_labels = len(label_list)
    pretrain_model_dir = 'roberta-large'  #'roberta-large' , 'roberta-large-mnli'
    # pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_epoch_2_acc_4.156359461121103' #'roberta-large' , 'roberta-large-mnli'
    model = RobertaForSequenceClassification.from_pretrained(
        pretrain_model_dir, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    '''optionally update the RoBERTa parameters with my 3-way model (commented out below)'''
    # model_roberta = RobertaForSequenceClassification.from_pretrained('/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_51_acc_4.199802825942953', num_labels=3)
    # model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', num_labels=3)
    # model.roberta.load_state_dict(model_roberta.roberta.state_dict())

    model.to(device)

    # Prepare optimizer
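    # Exclude bias and LayerNorm parameters from weight decay, the usual grouping
    # for fine-tuning transformer models with AdamW.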
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # processor.prepare_MRPC_labeled_set()
    # exit(0)
    train_examples, dev_examples, test_examples = processor.get_MRPC(
        '/export/home/Dataset/glue_data/MRPC/')
    # train_examples = train_examples_MNLI+train_examples_SNLI+train_examples_SciTail+train_examples_RTE+train_examples_ANLI
    dev_examples_list = [dev_examples, test_examples]

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0

    train_features = convert_examples_to_features(
        train_examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=False,  # xlnet has a cls token at the end; roberta does not
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,  # 2 if args.model_type in ['xlnet'] else 0
        sep_token=tokenizer.sep_token,
        sep_token_extra=True,  # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=False,  # pad on the left for xlnet only
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0)  # 4 if args.model_type in ['xlnet'] else 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    # all_task_label_ids = torch.tensor([f.task_label for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)

    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  drop_last=True)
    '''convert the dev and test examples to features'''
    valid_dataloader_list = []
    for valid_examples_i in dev_examples_list:
        valid_features = convert_examples_to_features(
            valid_examples_i,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=False,  # xlnet has a cls token at the end; roberta does not
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  # 2 if args.model_type in ['xlnet'] else 0
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,  # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,  # pad on the left for xlnet only
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)  # 4 if args.model_type in ['xlnet'] else 0

        logger.info("***** valid_examples *****")
        logger.info("  Num examples = %d", len(valid_examples_i))
        valid_input_ids = torch.tensor([f.input_ids for f in valid_features],
                                       dtype=torch.long)
        valid_input_mask = torch.tensor([f.input_mask for f in valid_features],
                                        dtype=torch.long)
        valid_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_label_ids = torch.tensor([f.label_id for f in valid_features],
                                       dtype=torch.long)
        # valid_task_label_ids = torch.tensor([f.task_label for f in valid_features], dtype=torch.long)

        valid_data = TensorDataset(valid_input_ids, valid_input_mask,
                                   valid_segment_ids, valid_label_ids)
        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.eval_batch_size)
        valid_dataloader_list.append(valid_dataloader)

    iter_co = 0
    max_dev_acc = 0.0
    max_dev_f1 = 0.0
    max_test_acc = 0.0
    max_test_f1 = 0.0
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, input_mask, None, labels=None)

            prob_matrix = F.log_softmax(logits[0].view(-1, num_labels), dim=1)
            loss = F.nll_loss(prob_matrix, label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                # Scale the loss through apex amp so fp16 gradients do not underflow.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Only update the weights once every gradient_accumulation_steps batches.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            iter_co += 1

            if iter_co % len(train_dataloader) == 0:
                # if iter_co % (len(train_dataloader)//5) ==0:
                '''
                start evaluating on the dev set after this epoch
                '''
                # if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
                #     model = torch.nn.DataParallel(model)
                model.eval()

                dev_acc_sum = 0.0
                for idd, valid_dataloader in enumerate(valid_dataloader_list):
                    preds = []
                    gold_label_ids = []

                    for _, batch in enumerate(valid_dataloader):
                        batch = tuple(t.to(device) for t in batch)
                        input_ids, input_mask, segment_ids, label_ids = batch
                        gold_label_ids += list(
                            label_ids.detach().cpu().numpy())

                        with torch.no_grad():
                            logits = model(input_ids=input_ids,
                                           attention_mask=input_mask,
                                           token_type_ids=None,
                                           labels=None)
                        logits = logits[0]
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0],
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)

                    preds = preds[0]
                    pred_probs = softmax(preds, axis=1)
                    pred_label_ids = np.argmax(pred_probs, axis=1)

                    assert len(pred_label_ids) == len(gold_label_ids)
                    hit_co = 0
                    overlap = 0
                    for k in range(len(pred_label_ids)):
                        if pred_label_ids[k] == gold_label_ids[k]:
                            hit_co += 1
                            if gold_label_ids[k] == 1:
                                overlap += 1
                    test_acc = hit_co / len(gold_label_ids)

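                    # Precision/recall/F1 treat label 1 as the positive class;
                    # the 1e-6 terms guard against division by zero.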
                    precision = overlap / (1e-6 + sum(pred_label_ids))
                    recall = overlap / (1e-6 + sum(gold_label_ids))
                    f1 = 2 * precision * recall / (precision + recall + 1e-6)

                    if idd == 0:  # is dev
                        if f1 > max_dev_f1:
                            max_dev_f1 = f1
                            if test_acc > max_dev_acc:
                                max_dev_acc = test_acc
                            print('\ncurrent dev f1:', f1, ' acc:', test_acc,
                                  ' max dev f1:', max_dev_f1, 'max_dev_acc:',
                                  max_dev_acc)
                        else:
                            print('\ncurrent dev f1:', f1, ' acc:', test_acc,
                                  ' max dev f1:', max_dev_f1, 'max_dev_acc:',
                                  max_dev_acc)
                            break
                    else:  # test
                        if f1 > max_test_f1:
                            max_test_f1 = f1
                            if test_acc > max_test_acc:
                                max_test_acc = test_acc
                            print('\ncurrent test f1:', f1, ' acc:', test_acc,
                                  ' max test f1:', max_test_f1,
                                  'max_test_acc:', max_test_acc)
                        else:
                            print('\ncurrent test f1:', f1, ' acc:', test_acc,
                                  ' max test f1:', max_test_f1,
                                  'max_test_acc:', max_test_acc)