params += list(model_tag.parameters())
if opt.task_sc:
    params += list(model_class.parameters())
params = list(filter(lambda p: p.requires_grad, params))

named_params = []
named_params += list(model_tag.named_parameters())
if opt.task_sc:
    named_params += list(model_class.named_parameters())
named_params = list(filter(lambda p: p[1].requires_grad, named_params))

# exclude biases and LayerNorm parameters from weight decay
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = len(train_feats['data']) // opt.batchSize * opt.max_epoch
optimizer = AdamW(optimizer_grouped_parameters, lr=opt.lr,
                  correct_bias=False)  # to reproduce BertAdam-specific behavior, set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps),
                                 t_total=num_train_optimization_steps)  # PyTorch scheduler


# prepare_inputs_for_bert(sentences, word_lengths)
def decode(data_feats, data_tags, data_class, output_path):
    data_index = np.arange(len(data_feats))
    losses = []
    TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0
    TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0
    with open(output_path, 'w') as f:
        for j in range(0, len(data_index), opt.test_batchSize):
            if opt.testing:
                words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class(
                    data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize,
                    add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing,
                    enc_dec_focus=opt.enc_dec, device=opt.device)
            else:
                words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class(
                    data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize,
                    add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing,
                    enc_dec_focus=opt.enc_dec, device=opt.device)
params.train_size = train_data['size']
params.val_size = val_data['size']

logging.info("Loading BERT model...")

# Prepare model
model = BertForSequenceTagging.from_pretrained(bert_class, num_labels=len(params.tag2idx))
model.to(params.device)

# Prepare optimizer
if params.full_finetuning:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': params.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:  # only fine-tune the head classifier
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, correct_bias=False)
train_steps_per_epoch = params.train_size // params.batch_size
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=train_steps_per_epoch,
                                            num_training_steps=params.epoch_num * train_steps_per_epoch)

# Train and evaluate the model
logging.info("Starting training for {} epoch(s)".format(params.epoch_num))
train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, tagger_model_dir, args.restore_dir)
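# For reference, a minimal sketch (not part of the original scripts) of the learning-rate
# multiplier that get_linear_schedule_with_warmup applies on top of AdamW: linear warmup
# from 0 to 1 over num_warmup_steps, then linear decay to 0 at num_training_steps.
# The helper name below is illustrative only.
def linear_warmup_lr_multiplier(current_step, num_warmup_steps, num_training_steps):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - current_step) /
               float(max(1, num_training_steps - num_warmup_steps)))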
                               lr=opt.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=opt.l2)
elif opt.optim_choice.lower() == 'bertadam':
    num_train_optimization_steps = (len(train_dataloader.dataset) // opt.batchSize + 1) * opt.max_epoch
    opt.optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, warmup=opt.warmup_proportion,
                             t_total=num_train_optimization_steps)
elif opt.optim_choice.lower() == 'adamw':
    num_train_optimization_steps = (len(train_dataloader.dataset) // opt.batchSize + 1) * opt.max_epoch
    opt.optimizer = AdamW(optimizer_grouped_parameters, lr=opt.lr, correct_bias=False)
    opt.scheduler = get_linear_schedule_with_warmup(
        opt.optimizer,
        num_warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps),
        num_training_steps=num_train_optimization_steps
    )  # PyTorch scheduler

# loss functions
opt.class_loss_function = nn.BCELoss(reduction='sum')
opt.nll_loss_function = nn.NLLLoss(reduction='sum', ignore_index=Constants.PAD)

# training or testing
if opt.testing:
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument('--kshot', type=int, default=5,
                        help="k for k-shot training, i.e. how many training examples are sampled")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=1e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--beta_sampling_times', type=int, default=15,
                        help="how many mixup coefficients to sample from the Beta distribution per batch")
    parser.add_argument('--batch_mix_times', type=int, default=400,
                        help="how many mixup weight vectors to sample per batch during pretraining")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. "
                             "0 (default value): dynamic loss scaling. "
                             "Positive power of 2: static loss scaling value.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    train_examples = processor.get_RTE_as_train_k_shot(
        '/export/home/Dataset/glue_data/RTE/train.tsv', args.kshot)  # train_pu_half_v1.txt
    dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv')
    test_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt')
    label_list = ["entailment", "not_entailment"]
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), 'test size:', len(test_examples))

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = RobertaForSequenceClassification(num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,      # bool(args.model_type in ['xlnet']): xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,      # 2 if args.model_type in ['xlnet'] else 0
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,           # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)      # 4 if args.model_type in ['xlnet'] else 0

        '''load dev set'''
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
        dev_all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask, dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size)

        '''load test set'''
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,      # bool(args.model_type in ['xlnet']): xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,      # 2 if args.model_type in ['xlnet'] else 0
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,           # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)      # 4 if args.model_type in ['xlnet'] else 0

        eval_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        eval_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                real_batch_size = input_ids.shape[0]

                '''use mixup???'''
                if epoch_i < 20:
                    '''pretraining'''
                    use_mixup = 'pretrain'
                    lambda_vec = torch.rand(args.batch_mix_times, real_batch_size).to(device)
                    lambda_matrix = nn.Softmax(dim=1)(lambda_vec)  # (mix_time, batch_size)
                    logits = model(input_ids, input_mask, None, lambda_matrix, None, is_train=use_mixup)
                    loss_fct = CrossEntropyLoss(reduction='none')
                    mixup_logits = logits.view(-1, num_labels)  # (mixup_times, 2)
                    # equivalent to torch.repeat_interleave(mixup_logits, repeats=real_batch_size, dim=0),
                    # giving shape (mixup_times*batch_size, 2)
                    mixup_logits_repeat = tile(mixup_logits, 0, real_batch_size)
                    label_id_repeat = label_ids.view(-1).repeat(args.batch_mix_times)  # (0, 1, ..., batch, 0, 1, ..., batch)
                    mixup_loss_repeat = loss_fct(mixup_logits_repeat.view(-1, num_labels), label_id_repeat.view(-1))
                    mixup_loss = torch.sum(
                        mixup_loss_repeat.view(args.batch_mix_times, real_batch_size) * lambda_matrix,
                        dim=1)  # (mixup_time,)
                    loss = mixup_loss.mean()
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                else:
                    '''fine-tuning'''
                    use_mixup = 'finetune'
                    for _ in range(args.beta_sampling_times):
                        lambda_vec = beta.rvs(0.4, 0.4, size=1)[0]
                        logits = model(input_ids, input_mask, label_ids, None, lambda_vec, is_train=use_mixup)
                        loss = logits
                        loss.backward()
                        optimizer.step()
                        optimizer.zero_grad()

            '''
            start evaluate on dev set after this epoch
            '''
            model.eval()
            for idd, dev_or_test_dataloader in enumerate([dev_dataloader, test_dataloader]):
                if idd == 0:
                    logger.info("***** Running dev *****")
                    logger.info("  Num examples = %d", len(dev_examples))
                else:
                    logger.info("***** Running test *****")
                    logger.info("  Num examples = %d", len(test_examples))
                # logger.info("  Batch size = %d", args.eval_batch_size)

                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                gold_label_ids = []
                # print('Evaluating...')
                for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    gold_label_ids += list(label_ids.detach().cpu().numpy())
                    with torch.no_grad():
                        logits = model(input_ids, input_mask, None, None, None, is_train='test')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                preds = preds[0]
                pred_probs = softmax(preds, axis=1)
                pred_label_ids = list(np.argmax(pred_probs, axis=1))
                assert len(pred_label_ids) == len(gold_label_ids)
                hit_co = 0
                for k in range(len(pred_label_ids)):
                    if pred_label_ids[k] == gold_label_ids[k]:
                        hit_co += 1
                test_acc = hit_co / len(gold_label_ids)

                if idd == 0:  # this is dev
                    if test_acc > max_dev_acc:
                        max_dev_acc = test_acc
                        print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n')
                    else:
                        print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n')
                        break
                else:  # this is test
                    if test_acc > max_test_acc:
                        max_test_acc = test_acc
                    final_test_performance = test_acc
                    print('\ntest acc:', test_acc, ' max_test_acc:', max_test_acc, '\n')

        print('final_test_performance:', final_test_performance)
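# The training loop above calls a tile() helper that is not defined in this excerpt; per the
# inline comment it behaves like torch.repeat_interleave. A minimal sketch under that assumption
# (the name and signature are taken from the call site tile(x, dim, n_tile)):
import torch

def tile(x, dim, n_tile):
    # Repeat each slice along `dim` n_tile times, e.g. rows (a, b) -> (a, a, b, b) for n_tile=2.
    return torch.repeat_interleave(x, repeats=n_tile, dim=dim)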