def __init__( self, task: str = None, load_checkpoint: str = None, label_map: Dict = None, num_classes: int = 2, **kwargs, ): super(Electra, self).__init__() if label_map: self.label_map = label_map self.num_classes = len(label_map) else: self.num_classes = num_classes if task == 'sequence_classification': task = 'seq-cls' logger.warning( "current task name 'sequence_classification' was renamed to 'seq-cls', " "'sequence_classification' has been deprecated and will be removed in the future.", ) if task == 'seq-cls': self.model = ElectraForSequenceClassification.from_pretrained( pretrained_model_name_or_path='electra-small', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = paddle.metric.Accuracy() elif task == 'token-cls': self.model = ElectraForTokenClassification.from_pretrained( pretrained_model_name_or_path='electra-small', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = ChunkEvaluator(label_list=[ self.label_map[i] for i in sorted(self.label_map.keys()) ]) elif task == 'text-matching': self.model = ElectraModel.from_pretrained( pretrained_model_name_or_path='electra-small', **kwargs) self.dropout = paddle.nn.Dropout(0.1) self.classifier = paddle.nn.Linear( self.model.config['hidden_size'] * 3, 2) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained( pretrained_model_name_or_path='electra-small', **kwargs) else: raise RuntimeError( "Unknown task {}, task should be one in {}".format( task, self._tasks_supported)) self.task = task if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
def do_eval(args): paddle.set_device(args.device) # Create dataset, tokenizer and dataloader. train_ds, eval_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.label_list label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial(tokenize_and_align_labels, tokenizer=tokenizer, no_entity_id=no_entity_id, max_seq_len=args.max_seq_length) ignore_label = -100 batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32' ), # segment 'seq_len': Stack(dtype='int64'), 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label }): fn(samples) eval_ds = eval_ds.map(trans_func) eval_data_loader = DataLoader(dataset=eval_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if args.init_checkpoint_path: model_dict = paddle.load(args.init_checkpoint_path) model.set_dict(model_dict) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) model.eval() metric.reset() for step, batch in enumerate(eval_data_loader): input_ids, token_type_ids, length, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) preds = logits.argmax(axis=2) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( length, preds, labels) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) precision, recall, f1_score = metric.accumulate() print("eval loss: %f, precision: %f, recall: %f, f1: %f" % (avg_loss, precision, recall, f1_score))
def evaluate(args): paddle.set_device(args.device) # create dataset. test_ds = load_dataset(datafiles=(os.path.join(args.data_dir, 'test.tsv'))) word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic')) label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic')) # q2b.dic is used to replace DBC case to SBC case normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic')) trans_func = partial( convert_example, max_seq_len=args.max_seq_len, word_vocab=word_vocab, label_vocab=label_vocab, normlize_vocab=normlize_vocab) test_ds.map(trans_func) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0, dtype='int64'), # word_ids Stack(dtype='int64'), # length Pad(axis=0, pad_val=0, dtype='int64'), # label_ids ): fn(samples) # Create sampler for dataloader test_sampler = paddle.io.BatchSampler( dataset=test_ds, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader( dataset=test_ds, batch_sampler=test_sampler, return_list=True, collate_fn=batchify_fn) # Define the model network and metric evaluator model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab), len(label_vocab)) chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True) # Load the model and start predicting model_dict = paddle.load(args.init_checkpoint) model.load_dict(model_dict) model.eval() chunk_evaluator.reset() for batch in test_loader: token_ids, length, labels = batch preds = model(token_ids, length) num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_evaluator.compute( length, preds, labels) chunk_evaluator.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) precision, recall, f1_score = chunk_evaluator.accumulate() print("eval precision: %f, recall: %f, f1: %f" % (precision, recall, f1_score))
def evaluate(args): place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() paddle.set_device("gpu" if args.use_gpu else "cpu") # create dataset. test_dataset = LacDataset(args.data_dir, mode='test') batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0), # word_ids Stack(), # length Pad(axis=0, pad_val=0), # label_ids ): fn(samples) # Create sampler for dataloader test_sampler = paddle.io.BatchSampler(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader(dataset=test_dataset, batch_sampler=test_sampler, places=place, return_list=True, collate_fn=batchify_fn) # Define the model network and metric evaluator network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size, test_dataset.num_labels) inputs = InputSpec(shape=(-1, ), dtype="int16", name='inputs') lengths = InputSpec(shape=(-1, ), dtype="int16", name='lengths') model = paddle.Model(network, inputs=[inputs, lengths]) chunk_evaluator = ChunkEvaluator( label_list=test_dataset.label_vocab.keys(), suffix=True) model.prepare(None, None, chunk_evaluator) # Load the model and start predicting model.load(args.init_checkpoint) model.evaluate( eval_data=test_loader, batch_size=args.batch_size, log_freq=100, verbose=2, )
def evaluate(args): place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace() paddle.set_device("gpu" if args.use_gpu else "cpu") # create dataset. test_dataset = LacDataset(args.data_dir, mode='test') batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0), # word_ids Stack(), # length Pad(axis=0, pad_val=0), # label_ids ): fn(samples) # Create sampler for dataloader test_sampler = paddle.io.BatchSampler(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) test_loader = paddle.io.DataLoader(dataset=test_dataset, batch_sampler=test_sampler, places=place, return_list=True, collate_fn=batchify_fn) # Define the model network and metric evaluator network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size, test_dataset.num_labels) model = paddle.Model(network) chunk_evaluator = ChunkEvaluator( int(math.ceil((test_dataset.num_labels + 1) / 2.0)), "IOB") # + 1 for SOS and EOS model.prepare(None, None, chunk_evaluator) # Load the model and start predicting model.load(args.init_checkpoint) model.evaluate( eval_data=test_loader, batch_size=args.batch_size, log_freq=100, verbose=2, )
def test_ner_dataset(client): from paddlenlp.metrics import ChunkEvaluator from datasets import load_dataset import paddle dev_ds = load_dataset("msra_ner", split="test") import os if os.environ.get('https_proxy'): del os.environ['https_proxy'] if os.environ.get('http_proxy'): del os.environ['http_proxy'] print("Start infer...") metric = ChunkEvaluator( label_list=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']) idx = 0 batch_size = 32 max_len = len(dev_ds["tokens"]) - 1 while idx < max_len: end_idx = idx + batch_size if idx + batch_size < max_len else max_len data = dev_ds["tokens"][idx:end_idx] ret = client.predict(feed_dict={"tokens": data}) if ret.err_no != 0: raise ValueError("err_no", ret.err_no, "err_msg: ", ret.err_msg) # print("ret:", ret) if idx < batch_size * 2: print_ret(json.loads(ret.value[0]), data) # calculate metric preds = json.loads(ret.value[1]) label_list = dev_ds["ner_tags"][idx:end_idx] label_list = label_pad(label_list, preds) label_list = paddle.to_tensor(label_list) preds = paddle.to_tensor(preds) seq_len = [preds.shape[1]] * preds.shape[0] num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( paddle.to_tensor(seq_len), preds, label_list) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) idx += batch_size print(idx) res = metric.accumulate() print("acc: ", res)
def train(args): paddle.set_device(args.device) trainer_num = paddle.distributed.get_world_size() if trainer_num > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() # Create dataset. train_ds, test_ds = load_dataset( datafiles=(os.path.join(args.data_dir, 'train.tsv'), os.path.join(args.data_dir, 'test.tsv'))) word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic')) label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic')) # q2b.dic is used to replace DBC case to SBC case normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic')) trans_func = partial(convert_example, max_seq_len=args.max_seq_len, word_vocab=word_vocab, label_vocab=label_vocab, normlize_vocab=normlize_vocab) train_ds.map(trans_func) test_ds.map(trans_func) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0), dtype='int64' ), # word_ids Stack(dtype='int64'), # length Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64' ), # label_ids ): fn(samples) # Create sampler for dataloader train_sampler = paddle.io.DistributedBatchSampler( dataset=train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_sampler, return_list=True, collate_fn=batchify_fn) test_sampler = paddle.io.BatchSampler(dataset=test_ds, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader(dataset=test_ds, batch_sampler=test_sampler, return_list=True, collate_fn=batchify_fn) # Define the model netword and its loss model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab), len(label_vocab), crf_lr=args.crf_lr) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr, parameters=model.parameters()) chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True) if args.init_checkpoint: if os.path.exists(args.init_checkpoint): logger.info("Init checkpoint from %s" % args.init_checkpoint) model_dict = paddle.load(args.init_checkpoint) model.load_dict(model_dict) else: logger.info("Cannot init checkpoint from %s which doesn't exist" % args.init_checkpoint) logger.info("Start training") # Start training global_step = 0 last_step = args.epochs * len(train_loader) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() max_f1_score = -1 for epoch in range(args.epochs): for step, batch in enumerate(train_loader): train_reader_cost += time.time() - reader_start global_step += 1 token_ids, length, label_ids = batch train_start = time.time() loss = model(token_ids, length, label_ids) avg_loss = paddle.mean(loss) train_run_cost += time.time() - train_start total_samples += args.batch_size if global_step % args.logging_steps == 0: logger.info( "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, last_step, avg_loss, train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 avg_loss.backward() optimizer.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if rank == 0: paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "model_%d.pdparams" % global_step)) logger.info("Save %d steps model." 
% (global_step)) if args.do_eval: precision, recall, f1_score = evaluate( model, chunk_evaluator, test_loader) if f1_score > max_f1_score: max_f1_score = f1_score paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "best_model.pdparams")) logger.info("Save best model.") reader_start = time.time()
def train(args): if args.use_gpu: place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) paddle.set_device("gpu") else: place = paddle.CPUPlace() paddle.set_device("cpu") # create dataset. train_dataset = LacDataset(args.data_dir, mode='train') test_dataset = LacDataset(args.data_dir, mode='test') batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0), # word_ids Stack(), # length Pad(axis=0, pad_val=0), # label_ids ): fn(samples) # Create sampler for dataloader train_sampler = paddle.io.DistributedBatchSampler( dataset=train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) train_loader = paddle.io.DataLoader( dataset=train_dataset, batch_sampler=train_sampler, places=place, return_list=True, collate_fn=batchify_fn) test_sampler = paddle.io.BatchSampler( dataset=test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) test_loader = paddle.io.DataLoader( dataset=test_dataset, batch_sampler=test_sampler, places=place, return_list=True, collate_fn=batchify_fn) # Define the model netword and its loss network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size, train_dataset.num_labels) model = paddle.Model(network) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam( learning_rate=args.base_lr, parameters=model.parameters()) crf_loss = LinearChainCrfLoss(network.crf.transitions) chunk_evaluator = ChunkEvaluator( int(math.ceil((train_dataset.num_labels + 1) / 2.0)), "IOB") # + 1 for START and STOP model.prepare(optimizer, crf_loss, chunk_evaluator) if args.init_checkpoint: model.load(args.init_checkpoint) # Start training callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3) model.fit(train_data=train_loader, eval_data=test_loader, batch_size=args.batch_size, epochs=args.epochs, eval_freq=1, log_freq=10, save_dir=args.model_save_dir, save_freq=1, drop_last=True, shuffle=True, callbacks=callback)
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() raw_datasets = load_dataset(args.task_name) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) train_ds = raw_datasets['train'] column_names = train_ds.column_names label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) batchify_fn = DataCollatorForTokenClassification(tokenizer=tokenizer) # Define the model netword and its loss model = AutoModelForTokenClassification.from_pretrained( args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def tokenize_and_align_labels(examples, no_entity_id=0): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs test_ds = raw_datasets['test'] test_ds = test_ds.select(range(len(test_ds) - 1)) test_ds = test_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) if args.do_train: train_ds = train_ds.select(range(len(train_ds) - 1)) train_ds = train_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss() metric = ChunkEvaluator(label_list=label_list) global_step = 0 best_f1 = 0.0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 logits = model(batch['input_ids'], batch['token_type_ids']) loss = loss_fct(logits, batch['labels']) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: f1 = evaluate(model, loss_fct, metric, test_data_loader, label_num, "test") if f1 > best_f1: best_f1 = f1 output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step >= num_training_steps: print("best_f1: ", best_f1) return print("best_f1: ", best_f1) if args.do_eval: eval_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = AutoModelForTokenClassification.from_pretrained( args.model_name_or_path, num_classes=label_num) loss_fct = paddle.nn.loss.CrossEntropyLoss() metric = ChunkEvaluator(label_list=label_list) model.eval() metric.reset() for step, batch in enumerate(eval_data_loader): logits = model(batch["input_ids"], batch["token_type_ids"]) loss = loss_fct(logits, batch["labels"]) avg_loss = paddle.mean(loss) preds = logits.argmax(axis=2) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( batch["length"], preds, batch["labels"]) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) precision, recall, f1_score = metric.accumulate() print("eval loss: %f, precision: %f, recall: %f, f1: %f" % (avg_loss, precision, recall, f1_score))
# load dev data model_name = "skep_ernie_1.0_large_ch" label2id, id2label = load_dict(args.label_path) test_ds = load_dataset(read, data_path=args.test_path, lazy=False) tokenizer = SkepTokenizer.from_pretrained(model_name) trans_func = partial(convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len) test_ds = test_ds.map(trans_func, lazy=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), Stack(dtype="int64"), Pad(axis=0, pad_val=-1, dtype="int64") ): fn(samples) test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False) test_loader = paddle.io.DataLoader(test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn) # load model loaded_state_dict = paddle.load(args.model_path) model = SkepForTokenClassification.from_pretrained(model_name, num_classes=len(label2id)) model.load_dict(loaded_state_dict) metric = ChunkEvaluator(label2id.keys()) # evalute on dev data precision, recall, f1 = evaluate(model, test_loader, metric) print(f'evalution result: precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}')
def _dynabert_training(self, task_name, ofa_model, model, teacher_model, train_dataloader, eval_dataloader, width_mult_list, criterion, num_train_epochs, output_dir): metric = Accuracy() if task_name == "msra_ner": metric = ChunkEvaluator(label_list=self.train_dataset.label_list) @paddle.no_grad() def evaluate(model, criterion, data_loader, width_mult=1.0): model.eval() all_start_logits = [] all_end_logits = [] metric.reset() for batch in data_loader: if "cmrc2018" in task_name: input_ids, token_type_ids = batch['input_ids'], batch[ 'token_type_ids'] logits = model( input_ids, token_type_ids, attention_mask=[None, None]) if width_mult == 100: start_logits_tensor, end_logits_tensor = logits else: start_logits_tensor, end_logits_tensor = logits[0] for idx in range(start_logits_tensor.shape[0]): if len(all_start_logits) % 1000 == 0 and len( all_start_logits): logger.info("Processing example: %d" % len(all_start_logits)) all_start_logits.append(start_logits_tensor.numpy()[idx]) all_end_logits.append(end_logits_tensor.numpy()[idx]) else: input_ids, segment_ids, labels = batch['input_ids'], batch[ 'token_type_ids'], batch['labels'] logits = model( input_ids, segment_ids, attention_mask=[None, None]) if isinstance(logits, tuple): logits = logits[0] loss = criterion(logits, labels) if task_name == "msra_ner": preds = logits.argmax(axis=2) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( batch['seq_len'], preds, batch['labels']) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) else: correct = metric.compute(logits, labels) metric.update(correct) if "cmrc2018" in task_name: n_best_size = 20 max_answer_length = 50 all_predictions, _, _ = compute_prediction( self.eval_examples, self.eval_dataset, (all_start_logits, all_end_logits), False, n_best_size, max_answer_length) res = squad_evaluate( examples=[raw_data for raw_data in self.eval_examples], preds=all_predictions, is_whitespace_splited=False) if width_mult == 100: logger.info("teacher model, EM: %f, F1: %f" % (res['exact'], res['f1'])) else: logger.info("width_mult: %s, EM: %f, F1: %f, " % (str(width_mult), res['exact'], res['f1'])) res = res['exact'] else: res = metric.accumulate() # Teacher model's evaluation if task_name == "msra_ner": if width_mult == 100: logger.info( "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f" % (paddle.mean(loss).numpy(), res[0], res[1], res[2])) else: logger.info( "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f" % (str(width_mult), paddle.mean(loss).numpy(), res[0], res[1], res[2])) res = res[2] else: if width_mult == 100: logger.info("teacher model, eval loss: %f, acc: %s, " % (loss.numpy(), res)) else: logger.info("width_mult: %s, eval loss: %f, acc: %s, " % (str(width_mult), loss.numpy(), res)) model.train() return res from paddleslim.nas.ofa import OFA, DistillConfig, utils global_step = 0 lambda_logit = 1.0 tic_train = time.time() best_acc = 0.0 acc = 0.0 logger.info("DynaBERT training starts. This period will cost some time.") for epoch in range(num_train_epochs): # Step7: Set current epoch and task. 
ofa_model.set_epoch(epoch) ofa_model.set_task('width') for step, batch in enumerate(train_dataloader): global_step += 1 if "cmrc2018" in task_name: input_ids, token_type_ids, start_positions, end_positions = batch[ 'input_ids'], batch['token_type_ids'], batch[ 'start_positions'], batch['end_positions'] else: input_ids, token_type_ids, labels = batch['input_ids'], batch[ 'token_type_ids'], batch['labels'] for width_mult in width_mult_list: # Step8: Broadcast supernet config from width_mult, # and use this config in supernet training. net_config = utils.dynabert_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) logits, teacher_logits = ofa_model( input_ids, token_type_ids, attention_mask=[None, None]) rep_loss = ofa_model.calc_distill_loss() if "cmrc2018" in task_name: logit_loss = (soft_cross_entropy(logits[0], teacher_logits[0].detach()) \ + \ soft_cross_entropy(logits[1], teacher_logits[1].detach()))/2 else: logit_loss = soft_cross_entropy(logits, teacher_logits.detach()) loss = rep_loss + lambda_logit * logit_loss loss.backward() self.optimizer.step() self.lr_scheduler.step() self.optimizer.clear_grad() if global_step % self.args.logging_steps == 0: if paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, self.args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0: tic_eval = time.time() evaluate( teacher_model, criterion, eval_dataloader, width_mult=100) logger.info("eval done total : %s s" % (time.time() - tic_eval)) for idx, width_mult in enumerate(width_mult_list): net_config = utils.dynabert_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) tic_eval = time.time() acc = evaluate(ofa_model, criterion, eval_dataloader, width_mult) if acc > best_acc: best_acc = acc if paddle.distributed.get_rank() == 0: output_dir_width = os.path.join(output_dir, str(width_mult)) if not os.path.exists(output_dir_width): os.makedirs(output_dir_width) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir_width) logger.info("eval done total : %s s" % (time.time() - tic_eval)) if global_step > self.args.num_training_steps: if best_acc == 0.0: output_dir_width = os.path.join(output_dir, str(width_mult)) if not os.path.exists(output_dir_width): os.makedirs(output_dir_width) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir_width) logger.info("Best acc: %.4f" % (best_acc)) return ofa_model if "cmrc2018" in task_name: tic_eval = time.time() evaluate(teacher_model, criterion, eval_dataloader, width_mult=100) logger.info("eval done total : %s s" % (time.time() - tic_eval)) for idx, width_mult in enumerate(width_mult_list): net_config = utils.dynabert_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) tic_eval = time.time() acc = evaluate(ofa_model, criterion, eval_dataloader, width_mult) if acc > best_acc: best_acc = acc if paddle.distributed.get_rank() == 0: output_dir_width = os.path.join(output_dir, str(width_mult)) if not os.path.exists(output_dir_width): os.makedirs(output_dir_width) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model 
model_to_save.save_pretrained(output_dir_width) logger.info("eval done total : %s s" % (time.time() - tic_eval)) logger.info("Best acc: %.4f" % (best_acc)) return ofa_model
class ErnieV2(nn.Layer): """ Ernie model """ def __init__( self, task: str = None, load_checkpoint: str = None, label_map: Dict = None, num_classes: int = 2, **kwargs, ): super(ErnieV2, self).__init__() if label_map: self.label_map = label_map self.num_classes = len(label_map) else: self.num_classes = num_classes if task == 'sequence_classification': task = 'seq-cls' logger.warning( "current task name 'sequence_classification' was renamed to 'seq-cls', " "'sequence_classification' has been deprecated and will be removed in the future.", ) if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( pretrained_model_name_or_path='ernie-2.0-en', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = paddle.metric.Accuracy() elif task == 'token-cls': self.model = ErnieForTokenClassification.from_pretrained( pretrained_model_name_or_path='ernie-2.0-en', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = ChunkEvaluator(label_list=[ self.label_map[i] for i in sorted(self.label_map.keys()) ]) elif task is None: self.model = ErnieModel.from_pretrained( pretrained_model_name_or_path='ernie-2.0-en', **kwargs) else: raise RuntimeError( "Unknown task {}, task should be one in {}".format( task, self._tasks_supported)) self.task = task if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) return probs, loss, {'acc': acc} return probs elif self.task == 'token-cls': logits = result token_level_probs = F.softmax(logits, axis=-1) preds = token_level_probs.argmax(axis=-1) if labels is not None: loss = self.criterion(logits, labels.unsqueeze(-1)) num_infer_chunks, num_label_chunks, num_correct_chunks = \ self.metric.compute(None, seq_lengths, preds, labels) self.metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output @staticmethod def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. """ return ErnieTokenizer.from_pretrained( pretrained_model_name_or_path='ernie-2.0-en', *args, **kwargs)
def do_train(args): set_seed(args) tokenizer_class, eval_name, test_name, = DATASET_INFO[args.dataset] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) train_ds, eval_ds, test_ds = load_dataset( args.dataset, splits=["train", eval_name, test_name]) num_classes = len(train_ds.label_list) no_entity_id = num_classes - 1 paddle.set_device(args.device) trainer_num = paddle.distributed.get_world_size() if trainer_num > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() if rank == 0: if os.path.exists(args.model_name_or_path): logger.info("init checkpoint from %s" % args.model_name_or_path) model = ErnieDocForTokenClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes) model_config = model.ernie_doc.config if trainer_num > 1: model = paddle.DataParallel(model) train_ds_iter = SequenceLabelingIterator( train_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, random_seed=args.seed, no_entity_id=no_entity_id) eval_ds_iter = SequenceLabelingIterator( eval_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, mode="eval", no_entity_id=no_entity_id) test_ds_iter = SequenceLabelingIterator( test_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, mode="test", no_entity_id=no_entity_id) train_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device()) eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device()) test_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device()) num_training_examples = train_ds_iter.get_num_examples() num_training_steps = args.epochs * num_training_examples // args.batch_size // trainer_num logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank)) logger.info("Num train examples: %d" % num_training_examples) logger.info("Max train steps: %d" % num_training_steps) logger.info("Num warmup steps: %d" % int(num_training_steps * args.warmup_proportion)) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] # Construct dict name_dict = dict() for n, p in model.named_parameters(): name_dict[p.name] = n optimizer = AdamWDL(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, n_layers=model_config["num_hidden_layers"], layerwise_decay=args.layerwise_decay, name_dict=name_dict) criterion = paddle.nn.loss.CrossEntropyLoss() metric = ChunkEvaluator(label_list=train_ds.label_list) global_steps = 0 create_memory = partial(init_memory, args.batch_size, args.memory_length, model_config["hidden_size"], model_config["num_hidden_layers"]) # Copy the memory memories = create_memory() tic_train = time.time() best_f1 = 0 for epoch in range(args.epochs): train_ds_iter.shuffle_sample() train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device()) for step, batch in enumerate(train_dataloader, start=1): global_steps += 1 input_ids, position_ids, token_type_ids, attn_mask, labels, lengths, qids, \ gather_idx, need_cal_loss = batch logits, memories = model(input_ids, memories, token_type_ids, position_ids, attn_mask) logits, labels = list( map(lambda x: paddle.gather(x, gather_idx), [logits, labels])) loss = criterion(logits, labels) * need_cal_loss loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_steps % args.logging_steps == 0: logger.info( "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s" % (global_steps, epoch, loss, lr_scheduler.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_steps % args.save_steps == 0: # Evaluate logger.info("Eval:") precision, recall, f1_score = evaluate(model, metric, eval_dataloader, create_memory()) # Save if rank == 0: output_dir = os.path.join(args.output_dir, "model_%d" % (global_steps)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if f1_score > best_f1: logger.info("Save best model......") best_f1 = f1_score best_model_dir = os.path.join(args.output_dir, "best_model") if not os.path.exists(best_model_dir): os.makedirs(best_model_dir) model_to_save.save_pretrained(best_model_dir) tokenizer.save_pretrained(best_model_dir) if args.max_steps > 0 and global_steps >= args.max_steps: return logger.info("Final test result:") eval_acc = evaluate(model, metric, test_dataloader, create_memory())
def do_train(): paddle.set_device(args.device) set_seed(args.seed) train_ds, test_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False) model = FasterErnieForTokenClassification.from_pretrained( "ernie-1.0", num_classes=len(train_ds.label_list), max_seq_len=args.max_seq_length, is_split_into_words=True) # ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O'] label_num = len(train_ds.label_list) # the label 'O' index no_entity_id = label_num - 1 # ignore_label is for the label padding ignore_label = -100 trans_func = partial(batchify_fn, no_entity_id=no_entity_id, ignore_label=ignore_label, max_seq_len=args.max_seq_length) train_data_loader = DataLoader(dataset=train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=trans_func, return_list=True) test_data_loader = DataLoader(dataset=test_ds, batch_size=args.batch_size, collate_fn=trans_func, return_list=True) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=train_ds.label_list) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) global_step = 0 tic_train = time.time() for epoch in range(args.epochs): for step, (texts, labels, seq_lens) in enumerate(train_data_loader, start=1): texts = to_tensor(texts) global_step += 1 with paddle.amp.auto_cast( args.use_amp, custom_white_list=["fused_feedforward", "fused_attention"]): logits, preds = model(texts) loss = criterion(logits, labels) avg_loss = paddle.mean(loss) if global_step % 10 == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, 10 / (time.time() - tic_train))) tic_train = time.time() if args.use_amp: scaler.scale(avg_loss).backward() scaler.minimize(optimizer, avg_loss) else: avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % 500 == 0 or global_step == num_training_steps: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) evaluate(model, criterion, metric, test_data_loader, label_num) model.save_pretrained(save_dir)
class Electra(nn.Layer): """ Electra model """ def __init__( self, task: str = None, load_checkpoint: str = None, label_map: Dict = None, num_classes: int = 2, **kwargs, ): super(Electra, self).__init__() if label_map: self.label_map = label_map self.num_classes = len(label_map) else: self.num_classes = num_classes if task == 'sequence_classification': task = 'seq-cls' logger.warning( "current task name 'sequence_classification' was renamed to 'seq-cls', " "'sequence_classification' has been deprecated and will be removed in the future.", ) if task == 'seq-cls': self.model = ElectraForSequenceClassification.from_pretrained( pretrained_model_name_or_path='electra-small', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = paddle.metric.Accuracy() elif task == 'token-cls': self.model = ElectraForTokenClassification.from_pretrained( pretrained_model_name_or_path='electra-small', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = ChunkEvaluator(label_list=[ self.label_map[i] for i in sorted(self.label_map.keys()) ]) elif task == 'text-matching': self.model = ElectraModel.from_pretrained( pretrained_model_name_or_path='electra-small', **kwargs) self.dropout = paddle.nn.Dropout(0.1) self.classifier = paddle.nn.Linear( self.model.config['hidden_size'] * 3, 2) self.criterion = paddle.nn.loss.CrossEntropyLoss() self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained( pretrained_model_name_or_path='electra-small', **kwargs) else: raise RuntimeError( "Unknown task {}, task should be one in {}".format( task, self._tasks_supported)) self.task = task if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, query_input_ids=None, query_token_type_ids=None, query_position_ids=None, query_attention_mask=None, title_input_ids=None, title_token_type_ids=None, title_position_ids=None, title_attention_mask=None, seq_lengths=None, labels=None): if self.task != 'text-matching': result = self.model(input_ids, token_type_ids, position_ids, attention_mask) else: query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) return probs, loss, {'acc': acc} return probs elif self.task == 'token-cls': logits = result token_level_probs = F.softmax(logits, axis=-1) preds = token_level_probs.argmax(axis=-1) if labels is not None: loss = self.criterion(logits, labels.unsqueeze(-1)) num_infer_chunks, num_label_chunks, num_correct_chunks = \ self.metric.compute(None, seq_lengths, preds, labels) self.metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs elif self.task == 'text-matching': query_token_embedding = query_result query_token_embedding = self.dropout(query_token_embedding) 
query_attention_mask = paddle.unsqueeze( (query_input_ids != self.model.pad_token_id).astype( query_token_embedding.dtype), axis=2) query_token_embedding = query_token_embedding * query_attention_mask query_sum_embedding = paddle.sum(query_token_embedding, axis=1) query_sum_mask = paddle.sum(query_attention_mask, axis=1) query_mean = query_sum_embedding / query_sum_mask title_token_embedding = title_result title_token_embedding = self.dropout(title_token_embedding) title_attention_mask = paddle.unsqueeze( (title_input_ids != self.model.pad_token_id).astype( title_token_embedding.dtype), axis=2) title_token_embedding = title_token_embedding * title_attention_mask title_sum_embedding = paddle.sum(title_token_embedding, axis=1) title_sum_mask = paddle.sum(title_attention_mask, axis=1) title_mean = title_sum_embedding / title_sum_mask sub = paddle.abs(paddle.subtract(query_mean, title_mean)) projection = paddle.concat([query_mean, title_mean, sub], axis=-1) logits = self.classifier(projection) probs = F.softmax(logits) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) return probs, loss, {'acc': acc} return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output @staticmethod def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. """ return ElectraTokenizer.from_pretrained( pretrained_model_name_or_path='electra-small', *args, **kwargs)
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets( ["train", "dev"]) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_dataset.get_labels() label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial(convert_example, tokenizer=tokenizer, label_list=label_list, no_entity_id=label_num - 1, max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) ignore_label = -100 batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # length Pad(axis=0, pad_val=ignore_label) # label ): fn(samples) train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB") global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, length, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits.reshape([-1, label_num]), labels.reshape([-1])) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0: evaluate(model, loss_fct, metric, dev_data_loader, label_num) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) global_step += 1
collate_fn=batchify_fn) test_loader = paddle.io.DataLoader( dataset=test_ds, batch_size=200, drop_last=True, return_list=True, collate_fn=batchify_fn) network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num) model = paddle.Model(network) optimizer = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) crf_loss = LinearChainCrfLoss(network.crf) chunk_evaluator = ChunkEvaluator( label_list=train_ds.label_vocab.keys(), suffix=True) model.prepare(optimizer, crf_loss, chunk_evaluator) model.fit(train_data=train_loader, eval_data=dev_loader, epochs=10, save_dir='./results', log_freq=1) model.evaluate(eval_data=test_loader) outputs, lens, decodes = model.predict(test_data=test_loader) preds = parse_decodes(test_ds, decodes, lens) print('\n'.join(preds[:10]))
def do_eval(args): paddle.set_device(args.device) # Create dataset, tokenizer and dataloader. train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test')) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) no_entity_id = 0 def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs ignore_label = -100 batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32' ), # segment 'seq_len': Stack(dtype='int64'), 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label }): fn(samples) eval_ds = eval_ds.select(range(len(eval_ds) - 1)) eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True) eval_data_loader = DataLoader(dataset=eval_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if args.init_checkpoint_path: model_dict = paddle.load(args.init_checkpoint_path) model.set_dict(model_dict) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) model.eval() metric.reset() for step, batch in enumerate(eval_data_loader): input_ids, token_type_ids, length, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) preds = logits.argmax(axis=2) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( length, preds, labels) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) precision, recall, f1_score = metric.accumulate() print("eval loss: %f, precision: %f, recall: %f, f1: %f" % (avg_loss, precision, recall, f1_score))
def train(args): paddle.set_device(args.device) # Create dataset. train_ds, test_ds = load_dataset( datafiles=(os.path.join(args.data_dir, 'train.tsv'), os.path.join(args.data_dir, 'test.tsv'))) word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic')) label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic')) # q2b.dic is used to replace DBC case to SBC case normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic')) trans_func = partial(convert_example, max_seq_len=args.max_seq_len, word_vocab=word_vocab, label_vocab=label_vocab, normlize_vocab=normlize_vocab) train_ds.map(trans_func) test_ds.map(trans_func) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0, dtype='int64'), # word_ids Stack(dtype='int64'), # length Pad(axis=0, pad_val=0, dtype='int64'), # label_ids ): fn(samples) # Create sampler for dataloader train_sampler = paddle.io.DistributedBatchSampler( dataset=train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_sampler, return_list=True, collate_fn=batchify_fn) test_sampler = paddle.io.BatchSampler(dataset=test_ds, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader(dataset=test_ds, batch_sampler=test_sampler, return_list=True, collate_fn=batchify_fn) # Define the model netword and its loss model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab), len(label_vocab)) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr, parameters=model.parameters()) chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True) if args.init_checkpoint: model_dict = paddle.load(args.init_checkpoint) model.load_dict(model_dict) # Start training global_step = 0 last_step = args.epochs * len(train_loader) tic_train = time.time() for epoch in range(args.epochs): for step, batch in enumerate(train_loader): global_step += 1 token_ids, length, label_ids = batch loss = model(token_ids, length, label_ids) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print("global step %d / %d, loss: %f, speed: %.2f step/s" % (global_step, last_step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if paddle.distributed.get_rank() == 0: evaluate(model, chunk_evaluator, test_loader) paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "model_%d.pdparams" % global_step))
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() train_ds, test_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.label_list label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial(tokenize_and_align_labels, tokenizer=tokenizer, no_entity_id=no_entity_id, max_seq_len=args.max_seq_length) train_ds = train_ds.map(trans_func) ignore_label = -100 batchify_fn = lambda samples, fn=Dict( { 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id ), # segment 'seq_len': Stack(), # seq_len 'labels': Pad(axis=0, pad_val=ignore_label) # label }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = test_ds.map(trans_func) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, _, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits.reshape([-1, label_num]), labels.reshape([-1])) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, loss_fct, metric, test_data_loader, label_num) paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step))
# NOTE: excerpt of a training loop; it assumes model, state_dict, train_ds,
# train_data_loader, rank and args have been defined earlier in the surrounding
# do_train function.
model.set_dict(state_dict)
model = paddle.DataParallel(model)

num_training_steps = len(train_data_loader) * args.epochs

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=args.learning_rate,
    parameters=model.parameters(),
    weight_decay=args.weight_decay,
    apply_decay_param_fun=lambda x: x in decay_params)

metric = ChunkEvaluator(label_list=train_ds.label_list, suffix=True)

global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, seq_lens, labels = batch
        loss = model(input_ids,
                     token_type_ids,
                     seq_lens=seq_lens,
                     labels=labels)
        avg_loss = paddle.mean(loss)
        global_step += 1
        if global_step % 10 == 0 and rank == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, avg_loss,
                   10 / (time.time() - tic_train)))
def predict(self, dataset, tokenizer, batchify_fn, args, dev_example=None, dev_ds_ori=None): if args.collect_shape: self.set_dynamic_shape(args.max_seq_length, args.batch_size) if args.task_name == "cmrc2018": dataset_removed = dataset.remove_columns( ["offset_mapping", "attention_mask", "example_id"]) sample_num = len(dataset) batches = [] for i in range(0, sample_num, args.batch_size): batch_size = min(args.batch_size, sample_num - i) batch = [dataset_removed[i + j] for j in range(batch_size)] batches.append(batch) else: sample_num = len(dataset) batches = [] for i in range(0, sample_num, args.batch_size): batch_size = min(args.batch_size, sample_num - i) batch = [dataset[i + j] for j in range(batch_size)] batches.append(batch) if args.perf: for i, batch in enumerate(batches): batch = batchify_fn(batch) input_ids, segment_ids = batch["input_ids"].numpy( ), batch["token_type_ids"].numpy() output = self.predict_batch([input_ids, segment_ids]) if i > args.perf_warmup_steps: break time1 = time.time() nums = 0 for batch in batches: batch = batchify_fn(batch) input_ids, segment_ids = batch["input_ids"].numpy( ), batch["token_type_ids"].numpy() nums = nums + input_ids.shape[0] output = self.predict_batch([input_ids, segment_ids]) total_time = time.time() - time1 print("task name: %s, sample nums: %s, time: %s, QPS: %s " % (args.task_name, nums, total_time, nums / total_time)) else: if args.task_name == "msra_ner": metric = ChunkEvaluator(label_list=args.label_list) metric.reset() all_predictions = [] batch_num = len(dataset['input_ids']) for batch in batches: batch = batchify_fn(batch) input_ids, segment_ids = batch["input_ids"].numpy( ), batch["token_type_ids"].numpy() output = self.predict_batch([input_ids, segment_ids])[0] preds = np.argmax(output, axis=2) all_predictions.append(preds.tolist()) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( batch["seq_len"], paddle.to_tensor(preds), batch["labels"]) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) res = metric.accumulate() print("task name: %s, (precision, recall, f1): %s, " % (args.task_name, res)) elif args.task_name == "cmrc2018": all_start_logits = [] all_end_logits = [] for batch in batches: batch = batchify_fn(batch) input_ids, segment_ids = batch["input_ids"].numpy( ), batch["token_type_ids"].numpy() start_logits, end_logits = self.predict_batch( [input_ids, segment_ids]) for idx in range(start_logits.shape[0]): if len(all_start_logits) % 1000 == 0 and len( all_start_logits): print("Processing example: %d" % len(all_start_logits)) all_start_logits.append(start_logits[idx]) all_end_logits.append(end_logits[idx]) all_predictions, _, _ = compute_prediction( dev_example, dataset, (all_start_logits, all_end_logits), False, args.n_best_size, args.max_answer_length) res = squad_evaluate( examples=[raw_data for raw_data in dev_example], preds=all_predictions, is_whitespace_splited=False) print("task name: %s, EM: %s, F1: %s" % (args.task_name, res['exact'], res['f1'])) return all_predictions else: all_predictions = [] metric = METRIC_CLASSES[args.task_name]() metric.reset() for i, batch in enumerate(batches): batch = batchify_fn(batch) output = self.predict_batch([ batch["input_ids"].numpy(), batch["token_type_ids"].numpy() ])[0] preds = np.argmax(output, axis=1) all_predictions.append(preds.tolist()) correct = metric.compute(paddle.to_tensor(output), batch["labels"]) metric.update(correct) res = metric.accumulate() print("task name: %s, acc: %s, " % (args.task_name, res)) 
return all_predictions
# NOTE: excerpt; the train_loader construction is truncated here (only its closing
# argument survives below), and label_vocab, dev_ds, test_ds, batchify_fn and args
# are assumed to be defined earlier.
                                    collate_fn=batchify_fn)
dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                  batch_size=args.batch_size,
                                  return_list=True,
                                  collate_fn=batchify_fn)
test_loader = paddle.io.DataLoader(dataset=test_ds,
                                   batch_size=args.batch_size,
                                   return_list=True,
                                   collate_fn=batchify_fn)

# Define the model network and its loss
ernie = ErnieForTokenClassification.from_pretrained(
    "ernie-1.0", num_classes=len(label_vocab))
model = ErnieCrfForTokenClassification(ernie)

metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,
                                   parameters=model.parameters())

step = 0
for epoch in range(args.epochs):
    for input_ids, token_type_ids, lengths, labels in train_loader:
        loss = model(input_ids,
                     token_type_ids,
                     lengths=lengths,
                     labels=labels)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        step += 1
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Create dataset, tokenizer and dataloader. if args.dataset == "peoples_daily_ner": raw_datasets = load_dataset(args.dataset) else: raw_datasets = load_dataset(args.dataset) AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type] tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) train_ds = raw_datasets['train'] label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) no_entity_id = 0 def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs train_ds = train_ds.select(range(len(train_ds) - 1)) column_names = train_ds.column_names train_ds = train_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) ignore_label = -100 batchify_fn = DataCollatorForTokenClassification(tokenizer, ignore_label) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = raw_datasets['test'] test_ds = test_ds.select(range(len(test_ds) - 1)) test_ds = test_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) if args.dataset == "peoples_daily_ner": dev_ds = raw_datasets['validation'] dev_ds = dev_ds.select(range(len(dev_ds) - 1)) dev_ds = dev_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) dev_data_loader = DataLoader(dataset=dev_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = AutoForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 logits = model(batch['input_ids'], batch['token_type_ids']) loss = loss_fct(logits, batch['labels']) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: if args.dataset == "peoples_daily_ner": evaluate(model, loss_fct, metric, dev_data_loader, label_num, "valid") evaluate(model, loss_fct, metric, test_data_loader, label_num, "test") paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) if global_step >= num_training_steps: return
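
# Illustration (added; not in the original script): the label-alignment arithmetic in
# tokenize_and_align_labels above pads word-level tags so they line up with the
# tokenized input, which carries one [CLS] and one [SEP] token.  The toy numbers
# below are assumptions chosen only to show the bookkeeping.
no_entity_id = 0
tokenized_len = 7            # e.g. [CLS] + 5 sub-tokens + [SEP]
label_ids = [0, 3, 4, 0, 0]  # one tag per original word

# Truncate if tokenization (max_seq_len) cut off some words.
if tokenized_len - 2 < len(label_ids):
    label_ids = label_ids[:tokenized_len - 2]
# Add the "no entity" tag for [CLS]/[SEP] and pad to the tokenized length.
label_ids = [no_entity_id] + label_ids + [no_entity_id]
label_ids += [no_entity_id] * (tokenized_len - len(label_ids))
assert len(label_ids) == tokenized_len  # -> [0, 0, 3, 4, 0, 0, 0]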
def do_train():
    # NOTE: relies on the module-level `args` namespace parsed elsewhere.
    paddle.set_device(args.device)
    world_size = paddle.distributed.get_world_size()
    rank = paddle.distributed.get_rank()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.tag_path)

    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label_vocab=train_ds.label_vocab,
                         max_seq_len=args.max_seq_len,
                         no_entity_label=no_entity_label,
                         ignore_label=ignore_label,
                         is_test=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token type ids
        Stack(dtype='int64'),  # sequence lens
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # labels
    ): fn(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=batch_sampler,
                                        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_size=args.batch_size,
                                      collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=args.batch_size,
                                       collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(),
                            suffix=False)
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    step, best_f1 = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for idx, (input_ids, token_type_ids, seq_lens,
                  labels) in enumerate(train_loader):
            logits = model(input_ids,
                           token_type_ids).reshape([-1, train_ds.label_num])
            loss = paddle.mean(criterion(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            if step > 0 and step % args.skip_step == 0 and rank == 0:
                print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}')
            if step > 0 and step % args.valid_step == 0 and rank == 0:
                p, r, f1, avg_loss = evaluate(model, criterion, metric,
                                              len(label_map), dev_loader)
                print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, '
                      f'f1: {f1:.5f} current best {best_f1:.5f}')
                if f1 > best_f1:
                    best_f1 = f1
                    print(f'==============================================save best model '
                          f'best performance {best_f1:.5f}')
                    paddle.save(model.state_dict(),
                                '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # save the final model
    if rank == 0:
        paddle.save(model.state_dict(),
                    '{}/final.pdparams'.format(args.checkpoints))
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Create dataset, tokenizer and dataloader. if args.dataset == "peoples_daily_ner": train_ds, dev_ds, test_ds = load_dataset( args.dataset, splits=('train', 'dev', 'test'), lazy=False) else: train_ds, test_ds = load_dataset( args.dataset, splits=('train', 'test'), lazy=False) AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type] tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.label_list label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial( tokenize_and_align_labels, tokenizer=tokenizer, no_entity_id=no_entity_id, max_seq_len=args.max_seq_length) train_ds = train_ds.map(trans_func) ignore_label = -100 batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'), # segment 'seq_len': Stack(dtype='int64'), # seq_len 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader( dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = test_ds.map(trans_func) test_data_loader = DataLoader( dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) if args.dataset == "peoples_daily_ner": dev_ds = dev_ds.map(trans_func) dev_data_loader = DataLoader( dataset=dev_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = AutoForTokenClassification.from_pretrained( args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, _, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: if args.dataset == "peoples_daily_ner": evaluate(model, loss_fct, metric, dev_data_loader, label_num, "valid") evaluate(model, loss_fct, metric, test_data_loader, label_num, "test") paddle.save(model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) if global_step >= num_training_steps: return
def train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") # create dataset. train_dataset = LacDataset(args.data_dir, mode='train') test_dataset = LacDataset(args.data_dir, mode='test') batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=0), # word_ids Stack(), # length Pad(axis=0, pad_val=0), # label_ids ): fn(samples) # Create sampler for dataloader train_sampler = paddle.io.DistributedBatchSampler( dataset=train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) train_loader = paddle.io.DataLoader(dataset=train_dataset, batch_sampler=train_sampler, return_list=True, collate_fn=batchify_fn) test_sampler = paddle.io.BatchSampler(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader(dataset=test_dataset, batch_sampler=test_sampler, return_list=True, collate_fn=batchify_fn) # Define the model netword and its loss network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size, train_dataset.num_labels) model = paddle.Model(network) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr, parameters=model.parameters()) crf_loss = LinearChainCrfLoss(network.crf) chunk_evaluator = ChunkEvaluator( label_list=train_dataset.label_vocab.keys(), suffix=True) model.prepare(optimizer, crf_loss, chunk_evaluator) if args.init_checkpoint: model.load(args.init_checkpoint) # Start training callbacks = paddle.callbacks.ProgBarLogger( log_freq=10, verbose=3) if args.verbose else None model.fit(train_data=train_loader, eval_data=test_loader, batch_size=args.batch_size, epochs=args.epochs, eval_freq=1, log_freq=10, save_dir=args.model_save_dir, save_freq=1, shuffle=True, callbacks=callbacks)
def train(args): paddle.set_device(args.device) set_seed(102) trainer_num = paddle.distributed.get_world_size() if trainer_num > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() word_vocab, label_vocab, train_loader, test_loader = create_data_loader( args) # Define the model netword and its loss model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab), len(label_vocab), crf_lr=args.crf_lr) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr, parameters=model.parameters()) chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True) if args.init_checkpoint: if os.path.exists(args.init_checkpoint): logger.info("Init checkpoint from %s" % args.init_checkpoint) model_dict = paddle.load(args.init_checkpoint) model.load_dict(model_dict) else: logger.info("Cannot init checkpoint from %s which doesn't exist" % args.init_checkpoint) logger.info("Start training") # Start training global_step = 0 last_step = args.epochs * len(train_loader) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() max_f1_score = -1 for epoch in range(args.epochs): for step, batch in enumerate(train_loader): train_reader_cost += time.time() - reader_start global_step += 1 token_ids, length, label_ids = batch train_start = time.time() loss = model(token_ids, length, label_ids) avg_loss = paddle.mean(loss) train_run_cost += time.time() - train_start total_samples += args.batch_size if global_step % args.logging_steps == 0: logger.info( "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, last_step, avg_loss, train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 avg_loss.backward() optimizer.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if rank == 0: paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "model_%d.pdparams" % global_step)) logger.info("Save %d steps model." % (global_step)) if args.do_eval: precision, recall, f1_score = evaluate( model, chunk_evaluator, test_loader) if f1_score > max_f1_score: max_f1_score = f1_score paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "best_model.pdparams")) logger.info("Save best model.") reader_start = time.time()
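
# Sketch (added; not defined in the snippets above): a minimal version of the
# evaluate(model, chunk_evaluator, test_loader) helper that the BiGruCrf training
# loops call.  It assumes the model returns Viterbi-decoded tag ids when labels are
# omitted and that ChunkEvaluator.compute takes (lengths, predictions, labels), as
# in the other snippets; treat it as an assumption-level reconstruction.
@paddle.no_grad()
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()
    for token_ids, lengths, label_ids in data_loader:
        preds = model(token_ids, lengths)  # decoded tag ids, no loss
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            lengths, preds, label_ids)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
    model.train()
    return precision, recall, f1_score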
# NOTE: excerpt; an earlier DataLoader construction is truncated here (only its
# closing arguments survive below), and train_ds, dev_ds, test_ds, batchify_fn,
# train_loader and dev_loader are assumed to be defined earlier.
                                   drop_last=True,
                                   return_list=True,
                                   collate_fn=batchify_fn)
test_loader = paddle.io.DataLoader(dataset=test_ds,
                                   batch_size=200,
                                   drop_last=True,
                                   return_list=True,
                                   collate_fn=batchify_fn)

network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num)
model = paddle.Model(network)

optimizer = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=model.parameters())
crf_loss = LinearChainCrfLoss(network.crf.transitions)
chunk_evaluator = ChunkEvaluator((train_ds.label_num + 2) // 2, 'IOB')
model.prepare(optimizer, crf_loss, chunk_evaluator)

model.fit(train_data=train_loader,
          eval_data=dev_loader,
          epochs=10,
          save_dir='./results',
          log_freq=1)

model.evaluate(eval_data=test_loader)
outputs, lens, decodes = model.predict(test_data=test_loader)
preds = parse_decodes(test_ds, decodes, lens)
print('\n'.join(preds[:10]))
# NOTE: excerpt; the train_loader construction is truncated here (only its closing
# arguments survive below), and train_ds, dev_ds, test_ds, batchify_fn and
# ignore_label are assumed to be defined earlier.
                                   shuffle=True,
                                   return_list=True,
                                   collate_fn=batchify_fn)
dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                  batch_size=200,
                                  return_list=True,
                                  collate_fn=batchify_fn)
test_loader = paddle.io.DataLoader(dataset=test_ds,
                                   batch_size=200,
                                   return_list=True,
                                   collate_fn=batchify_fn)

model = ErnieForTokenClassification.from_pretrained(
    "ernie-1.0", num_classes=train_ds.label_num)

metric = ChunkEvaluator((train_ds.label_num + 2) // 2, "IOB")
loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,
                                   parameters=model.parameters())

step = 0
for epoch in range(10):
    model.train()
    for idx, (input_ids, segment_ids, length,
              labels) in enumerate(train_loader):
        logits = model(input_ids,
                       segment_ids).reshape([-1, train_ds.label_num])
        loss = paddle.mean(loss_fn(logits, labels.reshape([-1])))
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()  # clear_gradients() is the deprecated 1.x name