def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=args.num_classes)
    map_fn = partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)

    dev_ds = RCInterpret().read(args.data_dir)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "offset_mapping": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "overflow_to_sample": Stack(dtype='int32'),
    }): fn(samples)
    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
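
# Note (illustrative addition, not from the original file): Dict, Pad and Stack above are
# the paddlenlp.data collators. A minimal sketch of how the batchify_fn lambda turns a
# list of feature dicts into padded arrays, assuming a pad value of 0:
def _demo_batchify():
    """Illustrative only: mirrors the Dict/Pad/Stack collation used in init_roberta_var."""
    from paddlenlp.data import Dict, Pad, Stack
    fn = Dict({
        "input_ids": Pad(axis=0, pad_val=0),
        "overflow_to_sample": Stack(dtype='int32'),
    })
    samples = [
        {"input_ids": [1, 2], "overflow_to_sample": 0},
        {"input_ids": [3, 4, 5], "overflow_to_sample": 1},
    ]
    input_ids, overflow_to_sample = fn(samples)
    # input_ids          -> [[1, 2, 0], [3, 4, 5]]  (padded to the batch max length)
    # overflow_to_sample -> [0, 1]
    return input_ids, overflow_to_sample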
def init_roberta_var(args):
    if args.language == "ch":
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForSequenceClassification.from_pretrained(
        args.from_pretrained,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        dropout=0,
        num_labels=2,
        name='',
        return_inter_score=True)
    map_fn = partial(map_fn_senti, tokenizer=tokenizer, language=args.language)

    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'), args.language)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        # Pad token_type_ids with pad_token_type_id rather than pad_token_id,
        # which differs for the English BPE tokenizer.
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    }): fn(samples)
    dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=batchify_fn,
                                      return_list=True)

    return model, tokenizer, dataloader
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(args.from_pretrained)
    map_fn = functools.partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)

    dev_ds = RCInterpret().read(os.path.join(args.data_dir, 'dev'))
    # dev_ds = load_dataset('squad', splits='dev_v2', data_files=None)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    }): fn(samples)
    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
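
# Usage sketch (assumption, not part of the original script): the init_roberta_var
# variant directly above is driven by an `args` namespace with the attributes it reads
# (from_pretrained, language, data_dir, batch_size). A minimal inference-style loop over
# the dataloader it builds could look like this.
def _demo_forward_pass(args):
    """Illustrative only: run the QA model over the dev dataloader built above."""
    model, tokenizer, dev_dataloader, dev_ds = init_roberta_var(args)
    model.eval()
    with paddle.no_grad():
        for batch in dev_dataloader:
            input_ids, token_type_ids = batch[0], batch[1]
            # RobertaForQuestionAnswering returns start/end position logits.
            start_logits, end_logits = model(input_ids=input_ids,
                                             token_type_ids=token_type_ids)
            break  # one batch is enough for the sketch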
def do_train():
    """
    This function is the main part of the fine-tuning process.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    if args.language == 'ch':
        train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])
        if args.base_model == 'roberta_base':
            tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext-large', num_classes=2)
    else:
        train_ds, dev_ds = load_dataset('glue', "sst-2", splits=["train", "dev"])
        # For the English version, models are loaded from the local machine.
        if args.base_model == 'roberta_base':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-base')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-base', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-large')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-large', num_classes=2)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         language=args.language)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
        Stack(dtype="int64")                               # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    log_per_step = 100 if args.language == 'en' else 10
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids=input_ids, token_type_ids=token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % log_per_step == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       log_per_step / (time.time() - tic_train)),
                    flush=True)
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % (log_per_step * 10) == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
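
# Sketch (assumption): `evaluate` is called in the checkpoint branch above but defined
# elsewhere in the original script. A minimal version consistent with the call signature
# evaluate(model, criterion, metric, data_loader) could look like the following; the
# function name here is illustrative and not the author's implementation.
@paddle.no_grad()
def _evaluate_sketch(model, criterion, metric, data_loader):
    """Illustrative only: average loss and accuracy over one pass of data_loader."""
    import numpy as np  # local import so the sketch stays self-contained
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch
        logits = model(input_ids=input_ids, token_type_ids=token_type_ids)
        losses.append(criterion(logits, labels).numpy())
        metric.update(metric.compute(F.softmax(logits, axis=1), labels))
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), metric.accumulate()))
    metric.reset()
    model.train()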
            tokenized_examples[i]['answerable_label'] = 1

    return tokenized_examples


if __name__ == "__main__":
    args = get_args()

    log.debug('----------- Configuration Arguments -----------')
    for arg, value in sorted(six.iteritems(vars(args))):
        log.debug('%s: %s' % (arg, value))
    log.debug('------------------------------------------------')

    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=2)

    train_ds = DuReaderChecklist().read(args.train_data_dir)
    dev_ds = DuReaderChecklist().read(args.dev_data_dir)
    train_ds.map(map_fn_DuCheckList_finetune, batched=True)
    dev_ds.map(map_fn_DuCheckList_finetune, batched=True)
    log.debug('train set: %d' % len(train_ds))
    log.debug('dev set: %d' % len(dev_ds))

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.bsz, shuffle=True)
    dev_batch_sample = paddle.io.DistributedBatchSampler(