def __init__(self, topk=(1, ), pos_label=1, name='acc_and_f1', *args, **kwargs):
    """Initialize the composite metric wrapper.

    Stores the configuration, builds the wrapped Accuracy, Precision and
    Recall sub-metrics, then clears all state via ``reset()``.
    """
    super(AccuracyAndF1, self).__init__(*args, **kwargs)
    # Remember configuration on the instance.
    self.topk, self.pos_label, self._name = topk, pos_label, name
    # Sub-metrics aggregated by this wrapper.
    self.acc = Accuracy(self.topk, *args, **kwargs)
    self.precision = Precision(*args, **kwargs)
    self.recall = Recall(*args, **kwargs)
    # Start from a clean state.
    self.reset()
def compute_metrics(p):
    """Compute top-1 accuracy for an evaluation prediction object.

    Args:
        p: EvalPrediction-like object with ``.predictions`` (ndarray, or a
            tuple whose first element is the logits ndarray) and
            ``.label_ids`` (ndarray of gold labels).

    Returns:
        dict: ``{"accuracy": <float>}`` accumulated over the whole batch.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = paddle.to_tensor(preds)
    label = paddle.to_tensor(p.label_ids)
    # NOTE: the original also computed ``F.softmax(preds, axis=1)`` into an
    # unused local; Accuracy only needs the logits' ranking, so that dead
    # code is removed.
    metric = Accuracy()
    metric.reset()
    correct = metric.compute(preds, label)
    metric.update(correct)
    accu = metric.accumulate()
    metric.reset()
    return {"accuracy": accu}
def main(cfg):
    """Train an image classifier on Cifar100, driven by config object `cfg`."""
    # Seed both paddle and numpy for reproducibility.
    paddle.seed(cfg.COMMON.seed)
    np.random.seed(cfg.COMMON.seed)
    net = build_classifier(cfg.CLASSIFIER)
    model = paddle.Model(net)
    # FLOPs are computed for reporting only; the value is not used below.
    FLOPs = paddle.flops(net, [1, 3, 32, 32], print_detail=False)
    lrs = build_lrscheduler(cfg.SCHEDULER)
    optim = build_optim(cfg.OPTIMIZER, parameters=net.parameters(), learning_rate=lrs)
    train_transforms, val_transforms = build_transform()
    train_set = Cifar100(cfg.COMMON.data_path, mode='train', transform=train_transforms)
    test_set = Cifar100(cfg.COMMON.data_path, mode='test', transform=val_transforms)
    # VisualDL run name, e.g. "/<model>-2021-01-01-12:00:00".
    # NOTE(review): ':' in the name is not portable to Windows paths — confirm.
    vis_name = '/{}-{}'.format(
        cfg.CLASSIFIER.name, time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()))
    callbacks = [LRSchedulerC(), VisualDLC(cfg.COMMON.logdir + vis_name)]
    # Top-1 and top-5 accuracy are tracked during fit.
    model.prepare(optim, CrossEntropyLoss(), Accuracy(topk=(1, 5)))
    model.fit(
        train_set,
        test_set,
        batch_size=cfg.COMMON.batch_size,
        epochs=cfg.COMMON.epochs,
        num_workers=cfg.COMMON.workers,
        verbose=cfg.COMMON.verbose,
        callbacks=callbacks,
    )
def main(_):
    """Train or evaluate a resnet18 model, or prepare rollout evaluation data.

    Behavior is selected by ``FLAGS.mode``: 'train' fine-tunes and exports an
    inference model; 'eval_rollout' loads metadata from ``FLAGS.data_path``.
    """
    transform = T.Compose([T.ToTensor(), T.Normalize(mean=0.5, std=0.5)])
    # Image paths/labels are left empty here; presumably filled elsewhere or
    # by the dataset itself — TODO confirm.
    train_img_path = []
    train_label = []
    # NOTE(review): keyword 'lable' looks like a typo, but it must match
    # MyDataset's parameter name — confirm against MyDataset before renaming.
    train_dataset = MyDataset(image=train_img_path,
                              lable=train_label,
                              transform=transform)
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=paddle.CPUPlace(),
                                        batch_size=2,
                                        shuffle=True)
    model = resnet18(pretrained=True, num_classes=102, with_pool=True)
    model = paddle.Model(model)
    optim = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=model.parameters())
    """Train or evaluates the model."""
    if FLAGS.mode == 'train':
        # BUG FIX: paddle.Model.prepare takes `metrics`, not `metric`;
        # the original `metric=Accuracy()` raised a TypeError.
        model.prepare(
            optimizer=optim,
            loss=paddle.nn.MSELoss(),  # NOTE(review): MSE with an Accuracy
            # metric is unusual for classification — confirm intended.
            metrics=Accuracy()  # topk: number of top predictions used for
            # accuracy; defaults to 1.
        )
        model.fit(
            train_loader,
            epochs=2,
            verbose=1,
        )
        model.evaluate(train_dataset, batch_size=2, verbose=1)
        # Export for inference (training=False strips training-only state).
        model.save('inference_model', training=False)
    elif FLAGS.mode == 'eval_rollout':
        metadata = _read_metadata(FLAGS.data_path)
def test_static_multiple_gpus(self):
    """End-to-end LeNet/MNIST fit, evaluate and predict on GPU device(s),
    then checks that predicted accuracy matches the evaluate() result."""
    device = set_device('gpu')
    # -1 leaves the batch dimension unspecified.
    im_shape = (-1, 1, 28, 28)
    batch_size = 128
    inputs = [Input(im_shape, 'float32', 'image')]
    labels = [Input([None, 1], 'int64', 'label')]
    model = Model(LeNet(), inputs, labels)
    optim = fluid.optimizer.Momentum(
        learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
    model.prepare(optim, CrossEntropyLoss(), Accuracy())
    train_dataset = MnistDataset(mode='train')
    val_dataset = MnistDataset(mode='test')
    # Unlabeled copy of the test split, used for predict().
    test_dataset = MnistDataset(mode='test', return_label=False)
    cbk = paddle.callbacks.ProgBarLogger(50)
    model.fit(train_dataset,
              val_dataset,
              epochs=2,
              batch_size=batch_size,
              callbacks=cbk)
    eval_result = model.evaluate(val_dataset, batch_size=batch_size)
    output = model.predict(
        test_dataset, batch_size=batch_size, stack_outputs=True)
    # predict() must produce one output row per test sample.
    np.testing.assert_equal(output[0].shape[0], len(test_dataset))
    # Accuracy recomputed from raw outputs must agree with evaluate().
    acc = compute_accuracy(output[0], val_dataset.labels)
    np.testing.assert_allclose(acc, eval_result['acc'])
def func_warn_or_error(self):
    """Exercise ReduceLROnPlateau's invalid-argument, warning, and normal
    (min/max monitor) code paths during short training runs."""
    # Invalid factor: expect ValueError (presumably factor >= 1.0 is
    # rejected — confirm against the callback's validation).
    with self.assertRaises(ValueError):
        paddle.callbacks.ReduceLROnPlateau(factor=2.0)
    # warning
    paddle.callbacks.ReduceLROnPlateau(mode='1', patience=3, verbose=1)
    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    train_dataset = CustomMnist(mode='train', transform=transform)
    val_dataset = CustomMnist(mode='test', transform=transform)
    net = LeNet()
    optim = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=net.parameters())
    inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
    labels = [InputSpec([None, 1], 'int64', 'label')]
    model = Model(net, inputs=inputs, labels=labels)
    model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
    # Monitoring 'miou', which is never produced — exercises the
    # missing-monitor warning path.
    callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='miou',
                                                   patience=3,
                                                   verbose=1)
    model.fit(train_dataset,
              val_dataset,
              batch_size=8,
              log_freq=1,
              save_freq=10,
              epochs=1,
              callbacks=[callbacks])
    # Second run: piecewise-decay LR and a 'max'-mode monitor on accuracy.
    optim = paddle.optimizer.Adam(
        learning_rate=paddle.optimizer.lr.PiecewiseDecay([0.001, 0.0001],
                                                         [5, 10]),
        parameters=net.parameters())
    model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
    callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='acc',
                                                   mode='max',
                                                   patience=3,
                                                   verbose=1,
                                                   cooldown=1)
    model.fit(train_dataset,
              val_dataset,
              batch_size=8,
              log_freq=1,
              save_freq=10,
              epochs=3,
              callbacks=[callbacks])
def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None):
    """Train LeNet in dynamic or static mode with fixed seeds, and check
    evaluation accuracy; also exercises num_iters and distributed samplers.

    Args:
        dynamic (bool): run in dygraph mode when True, static otherwise.
        num_replicas / rank: forwarded to DistributedBatchSampler.
        num_iters: optional cap on iterations for fit/evaluate.
    """
    fluid.enable_dygraph(self.device) if dynamic else None
    # Fix seeds so result['acc'] is comparable to self.acc1.
    seed = 333
    paddle.seed(seed)
    paddle.framework.random._manual_program_seed(seed)
    net = LeNet()
    optim_new = fluid.optimizer.Adam(
        learning_rate=0.001, parameter_list=net.parameters())
    model = Model(net, inputs=self.inputs, labels=self.labels)
    model.prepare(
        optim_new,
        loss=CrossEntropyLoss(reduction="sum"),
        metrics=Accuracy())
    # shuffle=False keeps the data order deterministic for the seed check.
    model.fit(self.train_dataset, batch_size=64, shuffle=False)
    result = model.evaluate(self.val_dataset, batch_size=64)
    np.testing.assert_allclose(result['acc'], self.acc1)
    # Same run but capped at num_iters iterations.
    model.fit(self.train_dataset,
              batch_size=64,
              shuffle=False,
              num_iters=num_iters)
    result = model.evaluate(
        self.val_dataset, batch_size=64, num_iters=num_iters)
    # Distributed-sampler path: fit from explicit data loaders.
    train_sampler = DistributedBatchSampler(
        self.train_dataset,
        batch_size=64,
        shuffle=False,
        num_replicas=num_replicas,
        rank=rank)
    val_sampler = DistributedBatchSampler(
        self.val_dataset,
        batch_size=64,
        shuffle=False,
        num_replicas=num_replicas,
        rank=rank)
    train_loader = fluid.io.DataLoader(
        self.train_dataset,
        batch_sampler=train_sampler,
        places=self.device,
        return_list=True)
    val_loader = fluid.io.DataLoader(
        self.val_dataset,
        batch_sampler=val_sampler,
        places=self.device,
        return_list=True)
    model.fit(train_loader, val_loader)
    fluid.disable_dygraph() if dynamic else None
class AccuracyAndF1(Metric):
    """
    Encapsulates Accuracy, Precision, Recall and F1 metric logic.
    """

    def __init__(self, topk=(1, ), pos_label=1, name='acc_and_f1', *args, **kwargs):
        """Store configuration, build the wrapped sub-metrics, and reset."""
        super(AccuracyAndF1, self).__init__(*args, **kwargs)
        self.topk = topk
        self.pos_label = pos_label
        self._name = name
        # One tracker per underlying metric.
        self.acc = Accuracy(self.topk, *args, **kwargs)
        self.precision = Precision(*args, **kwargs)
        self.recall = Recall(*args, **kwargs)
        self.reset()

    def compute(self, pred, label, *args):
        """Cache label and positive-class probability, return accuracy mask."""
        self.label = label
        # Probability of the positive class, used later by precision/recall.
        self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label]
        return self.acc.compute(pred, label)

    def update(self, correct, *args):
        """Feed the cached state from compute() into all sub-metrics."""
        self.acc.update(correct)
        self.precision.update(self.preds_pos, self.label)
        self.recall.update(self.preds_pos, self.label)

    def accumulate(self):
        """Return (acc, precision, recall, f1, (acc + f1) / 2)."""
        acc = self.acc.accumulate()
        precision = self.precision.accumulate()
        recall = self.recall.accumulate()
        # 1/f1 = 1/2 * (1/precision + 1/recall); define f1 = 0 when either
        # component is 0 to avoid dividing by zero.
        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            f1 = (2 * precision * recall) / (precision + recall)
        return (
            acc,
            precision,
            recall,
            f1,
            (acc + f1) / 2,
        )

    def reset(self):
        """Clear all sub-metric state and the cached batch values."""
        self.acc.reset()
        self.precision.reset()
        self.recall.reset()
        self.label = None
        self.preds_pos = None

    def name(self):
        """
        Return name of metric instance.
        """
        return self._name
def main(args):
    """Train LeNet on MNIST (optionally with BF16 AMP) and verify that
    recomputed prediction accuracy matches the evaluate() result."""
    print('download training data and load training data')
    train_dataset = MnistDataset(mode='train', )
    val_dataset = MnistDataset(mode='test', )
    # Unlabeled test split for predict().
    test_dataset = MnistDataset(mode='test', return_label=False)
    im_shape = (-1, 1, 28, 28)
    batch_size = 64
    inputs = [Input(im_shape, 'float32', 'image')]
    labels = [Input([None, 1], 'int64', 'label')]
    model = Model(LeNet(), inputs, labels)
    optim = paddle.optimizer.SGD(learning_rate=0.001)
    if args.bf16:
        # Wrap the optimizer so the listed ops run in bfloat16.
        optim = amp.bf16.decorate_bf16(
            optim,
            amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(
                custom_bf16_list={
                    'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add',
                    'reshape2', 'slice', 'reduce_mean', 'conv2d'
                }, ))
    # Configuration model
    model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy())
    # Training model
    if args.bf16:
        print('Training BF16')
    else:
        print('Training FP32')
    model.fit(train_dataset, epochs=2, batch_size=batch_size, verbose=1)
    eval_result = model.evaluate(val_dataset, batch_size=batch_size, verbose=1)
    output = model.predict(
        test_dataset, batch_size=batch_size, stack_outputs=True)
    # One prediction row per test sample.
    np.testing.assert_equal(output[0].shape[0], len(test_dataset))
    acc = compute_accuracy(output[0], val_dataset.labels)
    print("acc", acc)
    print("eval_result['acc']", eval_result['acc'])
    np.testing.assert_allclose(acc, eval_result['acc'])
def evaluate(self, dynamic):
    """Evaluate a saved LeNet checkpoint, from a dataset and from a loader.

    Args:
        dynamic (bool): run in dygraph mode when True, static otherwise.
    """
    if dynamic:
        fluid.enable_dygraph(self.device)
    model = Model(LeNet(), self.inputs, self.labels)
    model.prepare(metrics=Accuracy())
    model.load(self.weight_path)
    # Direct dataset evaluation must reproduce the reference accuracy.
    result = model.evaluate(self.val_dataset, batch_size=64)
    np.testing.assert_allclose(result['acc'], self.acc1)
    # Same evaluation routed through an explicit data loader.
    sampler = DistributedBatchSampler(
        self.val_dataset, batch_size=64, shuffle=False)
    val_loader = fluid.io.DataLoader(
        self.val_dataset,
        batch_sampler=sampler,
        places=self.device,
        return_list=True)
    model.evaluate(val_loader)
    if dynamic:
        fluid.disable_dygraph()
def func_reduce_lr_on_plateau(self):
    """Run a short MNIST training with the ReduceLROnPlateau callback."""
    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    train_dataset = CustomMnist(mode='train', transform=transform)
    val_dataset = CustomMnist(mode='test', transform=transform)
    # Build the model under test.
    net = LeNet()
    optim = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=net.parameters())
    inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
    labels = [InputSpec([None, 1], 'int64', 'label')]
    model = Model(net, inputs=inputs, labels=labels)
    model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
    # Default monitor ('loss'), short patience, one-epoch cooldown.
    callbacks = paddle.callbacks.ReduceLROnPlateau(patience=1,
                                                   verbose=1,
                                                   cooldown=1)
    model.fit(train_dataset,
              val_dataset,
              batch_size=8,
              log_freq=1,
              save_freq=10,
              epochs=10,
              callbacks=[callbacks])
def print_logs(args, step, logits, labels, loss, total_time, metric):
    """Print a one-line training log with the metric matching the task.

    'udc'/'atis_intent'/'mrda'/'swda' report accuracy (udc rebuilds a fresh
    Accuracy metric), 'dstc2' reports joint accuracy, 'atis_slot' micro-F1.
    """
    per_step = total_time / args.logging_steps  # seconds per logged step
    if args.task_name in ['udc', 'atis_intent', 'mrda', 'swda']:
        if args.task_name == 'udc':
            # udc ignores the passed-in metric and uses plain Accuracy.
            metric = Accuracy()
        metric.reset()
        correct = metric.compute(logits, labels)
        metric.update(correct)
        acc = metric.accumulate()
        print('step %d - loss: %.4f - acc: %.4f - %.3fs/step' %
              (step, loss, acc, per_step))
    elif args.task_name == 'dstc2':
        metric.reset()
        metric.update(logits, labels)
        joint_acc = metric.accumulate()
        print('step %d - loss: %.4f - joint_acc: %.4f - %.3fs/step' %
              (step, loss, joint_acc, per_step))
    elif args.task_name == 'atis_slot':
        metric.reset()
        metric.update(logits, labels)
        f1_micro = metric.accumulate()
        print('step %d - loss: %.4f - f1_micro: %.4f - %.3fs/step' %
              (step, loss, f1_micro, per_step))
def evaluate(model, args, mode='test'):
    """evaluate the model"""
    model.eval()
    metric = Accuracy()
    eval_dataloader, processor = load_example(args, mode)
    for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
        logits = model(input_ids=batch[0], token_type_ids=batch[1])
        # Labels arrive flat; reshape to the [N, 1] layout Accuracy expects.
        labels = batch[2].reshape((
            -1,
            1,
        ))
        metric.update(metric.compute(logits, labels))
    res = metric.accumulate()
    print('Accuracy:', res)
    # Restore training mode before returning.
    model.train()
    return res
# Command-line options for ERNIE-Doc fine-tuning.
parser.add_argument("--memory_length", type=int, default=128,
                    help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float,
                    help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
                    help="Linear warmup proption over the training process.")
parser.add_argument("--dataset", default="imdb",
                    choices=["imdb", "iflytek", "thucnews", "hyp"],
                    type=str, help="The training dataset")
parser.add_argument("--layerwise_decay", default=1.0, type=float,
                    help="Layerwise decay ratio")
parser.add_argument("--max_steps", default=-1, type=int,
                    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",)
# yapf: enable
args = parser.parse_args()

# tokenizer, eval_dataset, test_dataset, preprocess_text_fn, metric
# BPETokenizer for English Tasks
# ErnieDocTokenizer for Chinese Tasks
DATASET_INFO = {
    "imdb": (ErnieDocBPETokenizer, "test", "test", ImdbTextPreprocessor(),
             Accuracy()),
    "hyp": (ErnieDocBPETokenizer, "dev", "test", HYPTextPreprocessor(), F1()),
    "iflytek": (ErnieDocTokenizer, "dev", "dev", None, Accuracy()),
    "thucnews": (ErnieDocTokenizer, "dev", "test", None, Accuracy())
}


def set_seed(args):
    """Seed python/numpy/paddle RNGs from args.seed for reproducibility."""
    # Use the same data seed(for data shuffle) for all procs to guarantee data
    # consistency after sharding.
    random.seed(args.seed)
    np.random.seed(args.seed)
    # Maybe different op seeds(for dropout) for different procs is better. By:
    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
    paddle.seed(args.seed)
# Train resnet50 on Cifar10 with paddle's high-level Model API.
from paddle.vision.models import resnet50, vgg16, LeNet
from paddle.vision.datasets import Cifar10
from paddle.optimizer import Momentum
from paddle.regularizer import L2Decay
from paddle.nn import CrossEntropyLoss
from paddle.metric import Accuracy
from paddle.vision.transforms import Transpose

# Ensure images loaded from paddle.vision.datasets.Cifar10 are np.ndarray.
paddle.vision.set_image_backend('cv2')

# Build the resnet50 model (10 classes, trained from scratch).
model = paddle.Model(resnet50(pretrained=False, num_classes=10))

# Use the Cifar10 dataset; Transpose puts channels first.
train_dataset = Cifar10(mode='train', transform=Transpose())
val_dataset = Cifar10(mode='test', transform=Transpose())

# Define the optimizer: SGD with momentum and L2 weight decay.
optimizer = Momentum(
    learning_rate=0.01,
    momentum=0.9,
    weight_decay=L2Decay(1e-4),
    parameters=model.parameters())

# Prepare for training: track top-1 and top-5 accuracy.
model.prepare(optimizer, CrossEntropyLoss(), Accuracy(topk=(1, 5)))

# Start training.
model.fit(train_dataset,
          val_dataset,
          epochs=50,
          batch_size=64,
          save_dir="./output",
          num_workers=8)
def test_earlystopping(self):
    """Exercise EarlyStopping callbacks (valid and invalid configurations)
    in both dynamic and static graph modes, with and without a val set."""
    paddle.seed(2020)
    for dynamic in [True, False]:
        paddle.enable_static if not dynamic else None
        device = paddle.set_device('cpu')
        # Small sample count keeps the test fast.
        sample_num = 100
        train_dataset = MnistDataset(mode='train', sample_num=sample_num)
        val_dataset = MnistDataset(mode='test', sample_num=sample_num)
        net = LeNet()
        optim = paddle.optimizer.Adam(learning_rate=0.001,
                                      parameters=net.parameters())
        inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
        labels = [InputSpec([None, 1], 'int64', 'label')]
        model = Model(net, inputs=inputs, labels=labels)
        model.prepare(optim,
                      loss=CrossEntropyLoss(reduction="sum"),
                      metrics=[Accuracy()])
        # Valid: stop when loss stops decreasing.
        callbacks_0 = paddle.callbacks.EarlyStopping('loss',
                                                     mode='min',
                                                     patience=1,
                                                     verbose=1,
                                                     min_delta=0,
                                                     baseline=None,
                                                     save_best_model=True)
        # Valid: 'auto' mode resolves direction from the monitor name.
        callbacks_1 = paddle.callbacks.EarlyStopping('acc',
                                                     mode='auto',
                                                     patience=1,
                                                     verbose=1,
                                                     min_delta=0,
                                                     baseline=0,
                                                     save_best_model=True)
        # Invalid mode string — exercises the fallback/warning path.
        callbacks_2 = paddle.callbacks.EarlyStopping('loss',
                                                     mode='auto_',
                                                     patience=1,
                                                     verbose=1,
                                                     min_delta=0,
                                                     baseline=None,
                                                     save_best_model=True)
        # Monitor 'acc_' is never produced — exercises missing-monitor path.
        callbacks_3 = paddle.callbacks.EarlyStopping('acc_',
                                                     mode='max',
                                                     patience=1,
                                                     verbose=1,
                                                     min_delta=0,
                                                     baseline=0,
                                                     save_best_model=True)
        model.fit(
            train_dataset,
            val_dataset,
            batch_size=64,
            save_freq=10,
            save_dir=self.save_dir,
            epochs=10,
            verbose=0,
            callbacks=[callbacks_0, callbacks_1, callbacks_2, callbacks_3])
        # Test for no val_loader
        model.fit(train_dataset,
                  batch_size=64,
                  save_freq=10,
                  save_dir=self.save_dir,
                  epochs=10,
                  verbose=0,
                  callbacks=[callbacks_0])
def run(args):
    """Fine-tune / predict a multiple-choice model on the CLUE CHID task.

    Each '#idiom' blank in a passage becomes one example of shape
    [num_choices=10, seq_len]: every candidate idiom is concatenated with the
    tokens surrounding the blank. Training saves the best checkpoint by dev
    accuracy; prediction writes chid11_predict.json.
    """
    paddle.set_device(args.device)
    set_seed(args)
    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        # Sentencepiece-style word-boundary marker used as a separator.
        SPIECE_UNDERLINE = '▁'

        def _is_chinese_char(cp):
            # True when the code point falls in a CJK unicode block.
            if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                    (cp >= 0x3400 and cp <= 0x4DBF) or  #
                    (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                    (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                    (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                    (cp >= 0x2B820 and cp <= 0x2CEAF) or
                    (cp >= 0xF900 and cp <= 0xFAFF) or  #
                    (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
                return True
            return False

        def is_fuhao(c):
            # True for common Chinese/full-width punctuation ("fuhao").
            if c == '。' or c == ',' or c == '!' or c == '?' or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \
                    or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \
                    or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \
                    or c == '‘' or c == '’':
                return True
            return False

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    # Inside a "#idiomN#" placeholder: copy characters until
                    # the placeholder ends, then close it with a separator.
                    output.append(char)
                    if context[index - 12:index + 1].startswith("#idiom"):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        # Start of a placeholder token.
                        is_blank = True
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            # Treat the SPIECE marker and narrow no-break space as whitespace.
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or c == SPIECE_UNDERLINE:
                return True
            return False

        def add_tokens_for_around(tokens, pos, num_tokens):
            # Collect up to num_tokens context tokens around position `pos`,
            # borrowing from the other side when one side is short.
            num_l = num_tokens // 2
            num_r = num_tokens - num_l
            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')
            return tokens_l, tokens_r

        # Reserve room for special tokens ([CLS]/[SEP]s) and the option.
        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]
            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                # Normalize full-width quotes/dashes/ellipses to ASCII.
                context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). \
                    replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()
                # Split on whitespace into word-level tokens.
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                # Subword-tokenize, keeping "#idiom" placeholders intact.
                all_doc_tokens = []
                for (i, token) in enumerate(doc_tokens):
                    if '#idiom' in token:
                        sub_tokens = [str(token)]
                    else:
                        sub_tokens = tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        all_doc_tokens.append(sub_token)
                tags = [blank for blank in doc_tokens if '#idiom' in blank]
                # Each sentence may have several tags
                for tag_index, tag in enumerate(tags):
                    pos = all_doc_tokens.index(tag)
                    tmp_l, tmp_r = add_tokens_for_around(all_doc_tokens, pos,
                                                         num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)
                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tag which is not considered in this new sample.
                            # Each idiom has four words, so 4 mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l
                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r
                    tokens_list = []
                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len]
                    for i, elem in enumerate(options):
                        option = tokenizer.tokenize(elem)
                        tokens = option + ['[SEP]'] + tokens_l + ['[unused1]'
                                                                  ] + tokens_r
                        tokens_list.append(tokens)
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(new_data["token_type_ids"])
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])
    if args.do_train:
        # Effective batch size is split across accumulation steps.
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_ds = dev_ds.map(partial(preprocess_function),
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)
        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)
        loss_fct = nn.CrossEntropyLoss()
        metric = Accuracy()
        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    # Average the loss over accumulated micro-batches.
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
                        # NOTE(review): evaluation runs every logging step in
                        # this reconstruction — confirm original nesting.
                        tic_eval = time.time()
                        acc = evaluate(model, loss_fct, metric,
                                       dev_data_loader)
                        print("eval acc: %.5f, eval done total : %s s" %
                              (acc, time.time() - tic_eval))
                        if paddle.distributed.get_rank() == 0 and acc > best_acc:
                            # Save the best checkpoint (unwrap DataParallel).
                            best_acc = acc
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            if not os.path.exists(args.output_dir):
                                os.makedirs(args.output_dir)
                            model_to_save.save_pretrained(args.output_dir)
                            tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)
    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        # No labels at predict time.
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)
        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)
        result = {}
        # First test blank id in CHID ("#idiom623377#").
        idx = 623377
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result["#idiom" + str(idx)] = pred
                idx += 1
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(
                os.path.join(args.output_dir, 'chid11_predict.json'),
                "w",
                encoding='utf-8') as writer:
            writer.write(json.dumps(result, ensure_ascii=False, indent=4) +
                         "\n")
def do_train(args):
    """Fine-tune an ERNIE-M sequence classifier on XNLI.

    Supports two task types: 'cross-lingual-transfer' (train on English only)
    and 'translate-train-all' (train on all languages). Uses layer-wise LR
    decay (AdamWDL), optional AMP, and periodically evaluates per language
    and checkpoints on rank 0.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    set_seed(args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        # Train on English only; evaluate cross-lingually later.
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        # Concatenate the training splits of every language.
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    # XNLI has 3 classes: entailment / neutral / contradiction.
    num_classes = 3
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    if args.max_steps > 0:
        # max_steps overrides the epoch count.
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs
    # warmup_steps (absolute) takes precedence over warmup_proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Construct dict mapping internal parameter names to readable names,
    # required by AdamWDL's layer-wise decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n
    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)
    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                # scaler.minimize performs the (unscaled) optimizer step.
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                # Evaluate on every language's test set.
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "ernie_m_ft_model_%d.pdparams" % (global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break
    # Final checkpoint on rank 0 after training finishes.
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
def do_train(args):
    """Fine-tune a pretrained classifier on the THUCNews dataset.

    Builds train/dev data loaders, trains with AdamW (+ optional AMP),
    and periodically evaluates and checkpoints on rank 0.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    set_seed(args)
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    train_ds = load_dataset('thucnews', splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # Pad inputs/segments per batch; label dtype depends on task type
    # (int64 for classification, float32 for regression).
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    dev_ds = load_dataset('thucnews', splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    # Regression tasks (no label_list) use a single output.
    num_classes = 1 if train_ds.label_list == None else len(train_ds.label_list)
    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    # max_steps overrides the epoch-derived step count when positive.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()
    metric = Accuracy()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, segment_ids)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaler.scale(loss).backward()
                # NOTE(review): minimize receives the *unscaled* loss here;
                # other scripts pass the scaled loss — confirm intended.
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "ft_model_%d.pdparams" % (global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                return
class AccuracyAndF1(Metric):
    """Composite metric bundling accuracy, precision, recall and F1.

    ``compute`` prepares the top-k correctness mask (and caches the
    positive-class probabilities), ``update`` feeds every sub-metric, and
    ``accumulate`` reports ``(acc, precision, recall, f1, (acc + f1) / 2)``.
    See the `paddle.metric
    <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/metric/Overview_cn.html>`_
    overview for the underlying metrics.

    Args:
        topk (int or tuple(int), optional): Top-k values used by the
            accuracy sub-metric. Defaults to (1,).
        pos_label (int, optional): Index of the positive class used for
            precision and recall. Defaults to 1.
        name (str, optional): Display name of this metric instance.
            Defaults to 'acc_and_f1'.

    Example:
        .. code-block::

            import paddle
            from paddlenlp.metrics import AccuracyAndF1

            x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]])
            y = paddle.to_tensor([[1], [0], [1], [1]])

            m = AccuracyAndF1()
            correct = m.compute(x, y)
            m.update(correct)
            res = m.accumulate()
            print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45)
    """

    def __init__(self,
                 topk=(1, ),
                 pos_label=1,
                 name='acc_and_f1',
                 *args,
                 **kwargs):
        super(AccuracyAndF1, self).__init__(*args, **kwargs)
        self.topk = topk
        self.pos_label = pos_label
        self._name = name
        # Delegate all bookkeeping to the three underlying paddle metrics.
        self.acc = Accuracy(self.topk, *args, **kwargs)
        self.precision = Precision(*args, **kwargs)
        self.recall = Recall(*args, **kwargs)
        self.reset()

    def compute(self, pred, label, *args):
        """Cache the labels and positive-class probabilities, then delegate
        the top-k correctness computation to the accuracy sub-metric.

        Args:
            pred (Tensor): Predicted tensor with shape
                [batch_size, num_classes], dtype float32 or float64.
            label (Tensor): Ground-truth tensor with shape [batch_size, 1],
                or one-hot [batch_size, num_classes], dtype int64.

        Returns:
            Tensor: float32 correctness mask of shape [batch_size, topk];
            each element tells whether the prediction matched the label.
        """
        self.label = label
        probs = paddle.nn.functional.softmax(pred)
        self.preds_pos = probs[:, self.pos_label]
        return self.acc.compute(pred, label)

    def update(self, correct, *args):
        """Push the latest batch into all three sub-metrics.

        Args:
            correct (Tensor): float32 correctness mask with shape
                [batch_size, topk], as produced by ``compute``.
        """
        self.acc.update(correct)
        self.precision.update(self.preds_pos, self.label)
        self.recall.update(self.preds_pos, self.label)

    def accumulate(self):
        """Calculate and return the accumulated metrics.

        Returns:
            tuple: ``(acc, precision, recall, f1, (acc + f1) / 2)`` where
            each entry is the corresponding accumulated value.
        """
        accuracy = self.acc.accumulate()
        prec = self.precision.accumulate()
        rec = self.recall.accumulate()
        # F1 is the harmonic mean of precision and recall, guarded
        # against a zero denominator.
        f1 = 0.0 if prec == 0.0 or rec == 0.0 else (2 * prec * rec) / (prec +
                                                                       rec)
        return (accuracy, prec, rec, f1, (accuracy + f1) / 2)

    def reset(self):
        """Clear all sub-metric state and the cached batch tensors."""
        self.acc.reset()
        self.precision.reset()
        self.recall.reset()
        self.label = None
        self.preds_pos = None

    def name(self):
        """Return the name of the metric instance.

        Returns:
            str: The name of the metric instance.
        """
        return self._name
def train(args):
    """Fine-tune XLNet for sequence classification on IMDB.

    Builds train/dev data loaders, constructs the model, LR scheduler and
    AdamW optimizer, runs the training loop, and periodically evaluates
    and checkpoints the model.

    Args:
        args: Parsed command-line namespace. Reads model_name_or_path,
            max_seq_length, batch_size, max_steps, num_train_epochs,
            warmup_steps, warmup_proportion, learning_rate, max_grad_norm,
            adam_epsilon, weight_decay, logging_steps, save_steps,
            output_dir and task_name.
    """
    # Load the raw IMDB train/test splits.
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)
    # Wrap them as MapDataset so they support lazy .map() transforms.
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])
    # Tokenizer matching the pretrained XLNet checkpoint.
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    # Build train_data_loader and dev_data_loader.
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # Collate fn: pad ids / token types / attention mask on the LEFT
    # (pad_right=False) and stack labels into one tensor.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    # Training configuration.
    # Fix the random seeds for reproducibility.
    set_seed(args)
    # Select the running device.
    use_gpu = True if paddle.get_device().startswith("gpu") else False
    if use_gpu:
        paddle.set_device('gpu:0')
    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    #paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)
    # Set up the LR scheduler; max_steps > 0 overrides num_train_epochs.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs
    # An explicit warmup step count takes precedence over the proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)
    # Build the optimizer: global-norm gradient clipping, with weight decay
    # applied to every parameter except biases and layer-norm weights.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    # Model training.
    metric = Accuracy()
    # Loss: cross entropy for classification; MSE for regression tasks
    # (no label list).
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()
    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                # Only rank 0 (or a single-process run) writes checkpoints.
                if (not paddle.distributed.get_world_size() > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                # NOTE(review): terminates the whole process once the step
                # budget is reached instead of returning to the caller.
                if global_step == num_training_steps:
                    exit(0)
                # Exclude evaluation time from the logged training speed.
                tic_train += time.time() - tic_eval
r, t = self.rnn0(x) x = paddle.concat([y1, y2, y3, r], axis=-1) x = self.rnn1(x) x = self.rnn2(x) x = paddle.flatten(x, start_axis=1) x = self.cls(x) return x if __name__ == '__main__': model = FallNet() model = paddle.Model(model) # model.summary((1,3,151)) # x = np.random.rand(1,3,151) # y = model.predict(x) # print(y.shape) from paddle.metric import Accuracy model.prepare(paddle.optimizer.Adam(0.0001, parameters=model.parameters()), paddle.nn.CrossEntropyLoss(), Accuracy()) # model.fit(train_dataset, # epochs=20, # batch_size=32, # verbose=1, # )
    def _dynabert_training(self, task_name, ofa_model, model, teacher_model,
                           train_dataloader, eval_dataloader, width_mult_list,
                           criterion, num_train_epochs, output_dir):
        """Train a DynaBERT width-adaptive supernet by distilling a teacher.

        Every batch is run once per width multiplier in ``width_mult_list``
        with a distillation loss (internal representations + logits) against
        ``teacher_model``; sub-models are evaluated periodically and the best
        checkpoint per width is saved under ``output_dir``.

        Returns:
            The trained ``ofa_model`` supernet.
        """
        # Accuracy for classification tasks; msra_ner uses a chunk evaluator.
        metric = Accuracy()
        if task_name == "msra_ner":
            metric = ChunkEvaluator(label_list=self.train_dataset.label_list)

        @paddle.no_grad()
        def evaluate(model, criterion, data_loader, width_mult=1.0):
            # width_mult == 100 is a sentinel meaning "the teacher model";
            # it also selects the non-nested logits layout below.
            model.eval()
            all_start_logits = []
            all_end_logits = []
            metric.reset()
            for batch in data_loader:
                if "cmrc2018" in task_name:
                    # Extractive QA: collect per-example start/end logits.
                    input_ids, token_type_ids = batch['input_ids'], batch[
                        'token_type_ids']
                    logits = model(input_ids,
                                   token_type_ids,
                                   attention_mask=[None, None])
                    if width_mult == 100:
                        start_logits_tensor, end_logits_tensor = logits
                    else:
                        start_logits_tensor, end_logits_tensor = logits[0]
                    for idx in range(start_logits_tensor.shape[0]):
                        if len(all_start_logits) % 1000 == 0 and len(
                                all_start_logits):
                            logger.info("Processing example: %d" %
                                        len(all_start_logits))
                        all_start_logits.append(
                            start_logits_tensor.numpy()[idx])
                        all_end_logits.append(end_logits_tensor.numpy()[idx])
                else:
                    input_ids, segment_ids, labels = batch['input_ids'], batch[
                        'token_type_ids'], batch['labels']
                    logits = model(input_ids,
                                   segment_ids,
                                   attention_mask=[None, None])
                    if isinstance(logits, tuple):
                        logits = logits[0]
                    loss = criterion(logits, labels)
                    if task_name == "msra_ner":
                        preds = logits.argmax(axis=2)
                        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                            batch['seq_len'], preds, batch['labels'])
                        metric.update(num_infer_chunks.numpy(),
                                      num_label_chunks.numpy(),
                                      num_correct_chunks.numpy())
                    else:
                        correct = metric.compute(logits, labels)
                        metric.update(correct)
            if "cmrc2018" in task_name:
                # Post-process QA logits into answer spans and score them
                # with the SQuAD-style evaluator; exact-match is returned.
                n_best_size = 20
                max_answer_length = 50
                all_predictions, _, _ = compute_prediction(
                    self.eval_examples, self.eval_dataset,
                    (all_start_logits, all_end_logits), False, n_best_size,
                    max_answer_length)
                res = squad_evaluate(
                    examples=[raw_data for raw_data in self.eval_examples],
                    preds=all_predictions,
                    is_whitespace_splited=False)
                if width_mult == 100:
                    logger.info("teacher model, EM: %f, F1: %f" %
                                (res['exact'], res['f1']))
                else:
                    logger.info("width_mult: %s, EM: %f, F1: %f, " %
                                (str(width_mult), res['exact'], res['f1']))
                res = res['exact']
            else:
                res = metric.accumulate()
                # Teacher model's evaluation
                if task_name == "msra_ner":
                    if width_mult == 100:
                        logger.info(
                            "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                            % (paddle.mean(loss).numpy(), res[0], res[1],
                               res[2]))
                    else:
                        logger.info(
                            "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                            % (str(width_mult), paddle.mean(loss).numpy(),
                               res[0], res[1], res[2]))
                    res = res[2]
                else:
                    if width_mult == 100:
                        logger.info("teacher model, eval loss: %f, acc: %s, " %
                                    (loss.numpy(), res))
                    else:
                        logger.info(
                            "width_mult: %s, eval loss: %f, acc: %s, " %
                            (str(width_mult), loss.numpy(), res))
            model.train()
            return res

        from paddleslim.nas.ofa import OFA, DistillConfig, utils
        global_step = 0
        lambda_logit = 1.0  # weight of the logit-distillation loss term
        tic_train = time.time()
        best_acc = 0.0
        acc = 0.0
        logger.info("DynaBERT training starts. This period will cost some time.")
        for epoch in range(num_train_epochs):
            # Step7: Set current epoch and task.
            ofa_model.set_epoch(epoch)
            ofa_model.set_task('width')
            for step, batch in enumerate(train_dataloader):
                global_step += 1
                if "cmrc2018" in task_name:
                    input_ids, token_type_ids, start_positions, end_positions = batch[
                        'input_ids'], batch['token_type_ids'], batch[
                            'start_positions'], batch['end_positions']
                else:
                    input_ids, token_type_ids, labels = batch[
                        'input_ids'], batch['token_type_ids'], batch['labels']
                # Gradients accumulate across every width before a single
                # optimizer step below.
                for width_mult in width_mult_list:
                    # Step8: Broadcast supernet config from width_mult,
                    # and use this config in supernet training.
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    logits, teacher_logits = ofa_model(
                        input_ids, token_type_ids, attention_mask=[None, None])
                    rep_loss = ofa_model.calc_distill_loss()
                    if "cmrc2018" in task_name:
                        # QA heads output (start, end) logits; average both.
                        logit_loss = (soft_cross_entropy(logits[0], teacher_logits[0].detach()) \
                                + \
                                soft_cross_entropy(logits[1], teacher_logits[1].detach()))/2
                    else:
                        logit_loss = soft_cross_entropy(
                            logits, teacher_logits.detach())
                    loss = rep_loss + lambda_logit * logit_loss
                    loss.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.clear_grad()
                if global_step % self.args.logging_steps == 0:
                    if paddle.distributed.get_rank() == 0:
                        logger.info(
                            "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, epoch, step, loss,
                               self.args.logging_steps /
                               (time.time() - tic_train)))
                    tic_train = time.time()
                if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0:
                    # Periodic eval: teacher first, then every sub-width;
                    # save a width's checkpoint whenever it improves.
                    tic_eval = time.time()
                    evaluate(teacher_model,
                             criterion,
                             eval_dataloader,
                             width_mult=100)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                    for idx, width_mult in enumerate(width_mult_list):
                        net_config = utils.dynabert_config(
                            ofa_model, width_mult)
                        ofa_model.set_net_config(net_config)
                        tic_eval = time.time()
                        acc = evaluate(ofa_model, criterion, eval_dataloader,
                                       width_mult)
                        if acc > best_acc:
                            best_acc = acc
                            if paddle.distributed.get_rank() == 0:
                                output_dir_width = os.path.join(
                                    output_dir, str(width_mult))
                                if not os.path.exists(output_dir_width):
                                    os.makedirs(output_dir_width)
                                # need better way to get inner model of DataParallel
                                model_to_save = model._layers if isinstance(
                                    model, paddle.DataParallel) else model
                                model_to_save.save_pretrained(output_dir_width)
                        logger.info("eval done total : %s s" %
                                    (time.time() - tic_eval))
                if global_step > self.args.num_training_steps:
                    # Step budget exhausted: make sure at least one
                    # checkpoint exists before returning early.
                    if best_acc == 0.0:
                        output_dir_width = os.path.join(
                            output_dir, str(width_mult))
                        if not os.path.exists(output_dir_width):
                            os.makedirs(output_dir_width)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir_width)
                    logger.info("Best acc: %.4f" % (best_acc))
                    return ofa_model
        if "cmrc2018" in task_name:
            # QA tasks are evaluated only once, after all epochs finish.
            tic_eval = time.time()
            evaluate(teacher_model, criterion, eval_dataloader, width_mult=100)
            logger.info("eval done total : %s s" % (time.time() - tic_eval))
            for idx, width_mult in enumerate(width_mult_list):
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                tic_eval = time.time()
                acc = evaluate(ofa_model, criterion, eval_dataloader,
                               width_mult)
                if acc > best_acc:
                    best_acc = acc
                    if paddle.distributed.get_rank() == 0:
                        output_dir_width = os.path.join(
                            output_dir, str(width_mult))
                        if not os.path.exists(output_dir_width):
                            os.makedirs(output_dir_width)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir_width)
                logger.info("eval done total : %s s" % (time.time() - tic_eval))
        logger.info("Best acc: %.4f" % (best_acc))
        return ofa_model
# # 初始化一个loss_log 的实例,然后将其作为参数传递给fit # loss_log = LossCallback() # 调用飞桨框架的VisualDL模块,保存信息到目录中。 callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir') model = paddle.Model(model) optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) # model.prepare() # 配置模型 model.prepare( optim, paddle.nn.CrossEntropyLoss(), Accuracy(topk=(1, 2)) ) model.fit(train_loader, epochs=500, verbose=1, callbacks=callback ) # print(loss_log.losses) model.evaluate(train_dataset, batch_size=8, verbose=1) # model.save() model.save('inference_model', False)
# Remaining command-line flags for the ernie-doc cail2019_scm trainer.
# (Fixed help-string typos: "epoches" -> "epochs", "proption" -> "proportion".)
parser.add_argument("--epochs", type=int, default=15,
                    help="Number of epochs for training.")
parser.add_argument("--device", type=str, default="gpu",
                    choices=["cpu", "gpu"],
                    help="Select cpu, gpu devices to train model.")
parser.add_argument("--seed", type=int, default=1,
                    help="Random seed for initialization.")
parser.add_argument("--memory_length", type=int, default=128,
                    help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float,
                    help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
                    help="Linear warmup proportion over the training process.")
parser.add_argument("--dataset", default="cail2019_scm",
                    choices=["cail2019_scm"], type=str,
                    help="The training dataset")
parser.add_argument("--dropout", default=0.1, type=float,
                    help="Dropout ratio of ernie_doc")
parser.add_argument("--layerwise_decay", default=1.0, type=float,
                    help="Layerwise decay ratio")
parser.add_argument("--max_steps", default=-1, type=int,
                    help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
# yapf: enable
args = parser.parse_args()

# Per-dataset configuration:
# (tokenizer class, dev split name, test split name, metric instance).
DATASET_INFO = {
    "cail2019_scm": (ErnieDocTokenizer, "dev", "test", Accuracy()),
}


def set_seed(args):
    """Seed the python, numpy and paddle RNGs for reproducible runs.

    Args:
        args: Namespace providing the integer ``seed`` attribute.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)


def init_memory(batch_size, memory_length, d_model, n_layers):
    """Build zero-initialized recurrence memories for ernie-doc.

    Args:
        batch_size (int): Number of sequences per batch.
        memory_length (int): Number of retained previous positions.
        d_model (int): Hidden size of the model.
        n_layers (int): Number of transformer layers.

    Returns:
        list[Tensor]: One float32 zero tensor of shape
        [batch_size, memory_length, d_model] per layer.
    """
    return [
        paddle.zeros([batch_size, memory_length, d_model], dtype="float32")
        for _ in range(n_layers)
    ]
self.linear2 = paddle.nn.Linear(in_features=120, out_features=84) self.linear3 = paddle.nn.Linear(in_features=84, out_features=10) def forward(self, x): # x = x.reshape((-1, 1, 28, 28)) x = self.conv1(x) x = F.relu(x) x = self.max_pool1(x) x = F.relu(x) x = self.conv2(x) x = self.max_pool2(x) x = paddle.flatten(x, start_axis=1, stop_axis=-1) x = self.linear1(x) x = F.relu(x) x = self.linear2(x) x = F.relu(x) x = self.linear3(x) return x from paddle.metric import Accuracy model = paddle.Model(LeNet()) # 用Model封装模型 optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) # 配置模型 model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy()) # 训练模型 model.fit(train_dataset, epochs=2, batch_size=64, verbose=1) model.evaluate(test_dataset, batch_size=64, verbose=1)
# Remaining command-line flags for the ernie-doc c3 trainer.
# (Fixed help-string typos: "proption" -> "proportion",
# "accumualte" -> "accumulate".)
parser.add_argument("--device", type=str, default="gpu",
                    choices=["cpu", "gpu"],
                    help="Select cpu, gpu devices to train model.")
parser.add_argument("--seed", type=int, default=1,
                    help="Random seed for initialization.")
parser.add_argument("--memory_length", type=int, default=128,
                    help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float,
                    help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
                    help="Linear warmup proportion over the training process.")
parser.add_argument("--dataset", default="c3", choices=["c3"], type=str,
                    help="The training dataset")
parser.add_argument("--layerwise_decay", default=0.8, type=float,
                    help="Layerwise decay ratio")
parser.add_argument("--batch_size", default=8, type=int,
                    help="Batch size per GPU/CPU for training.")
parser.add_argument("--gradient_accumulation_steps", default=4, type=int,
                    help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--max_steps", default=-1, type=int,
                    help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
# yapf: enable
args = parser.parse_args()

# Per-dataset configuration:
# (tokenizer class, dev split name, test split name, metric instance).
DATASET_INFO = {
    "c3": (ErnieDocTokenizer, "dev", "test", Accuracy()),
}


def set_seed(args):
    """Seed the python, numpy and paddle RNGs for reproducible runs.

    Args:
        args: Namespace providing the integer ``seed`` attribute.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)


def init_memory(batch_size, memory_length, d_model, n_layers):
    """Build zero-initialized recurrence memories for ernie-doc.

    Args:
        batch_size (int): Number of sequences per batch.
        memory_length (int): Number of retained previous positions.
        d_model (int): Hidden size of the model.
        n_layers (int): Number of transformer layers.

    Returns:
        list[Tensor]: One float32 zero tensor of shape
        [batch_size, memory_length, d_model] per layer.
    """
    return [
        paddle.zeros([batch_size, memory_length, d_model], dtype="float32")
        for _ in range(n_layers)
    ]