def main():
    set_seed(random_seed)

    config = CONFIGS[model_type].from_pretrained(model_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(model_dir, do_lower_case=True)

    # Create the datasets
    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    dev_dataset, num_dev_batch = load_dataset('dev', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    output_types, output_shapes = return_types_and_shapes(for_trainer=True)

    # Initialize the trainer
    trainer = Trainer(model_type, output_types, output_shapes,
                      use_xla=use_xla, use_torch_mode=use_torch_mode)

    # Build the model
    trainer.build_model(get_model_fn(model_type, config, num_classes=len(labels)))

    t_total = num_train_batch * epochs // gradient_accumulation_steps

    # Create the train op
    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))

    # Configure the trainer, passing in the train op and the maximum number
    # of checkpoints to keep
    trainer.compile(train_op, max_checkpoints=1)

    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(dev_dataset, 'dev')
    trainer.build_handle(test_dataset, 'test')

    # Load the pretrained weights; when not starting from a pretrained model,
    # call trainer.init_variables() instead
    trainer.from_pretrained(model_dir)

    best_score = 0.

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  Batch size = {}".format(batch_size))
    tf.logging.info("  Gradient accumulation steps = {}".format(gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (w. accumulation) = {}".format(
        batch_size * gradient_accumulation_steps))
    tf.logging.info("  Optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch), desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            # In torch mode, backward() only accumulates gradients and
            # train_step() later applies them; otherwise train_step() does both.
            if use_torch_mode:
                train_loss = trainer.backward()
            else:
                train_loss = trainer.train_step()
            epoch_iter.set_description(desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))
            if (step + 1) % gradient_accumulation_steps == 0:
                if use_torch_mode:
                    trainer.train_step()
                    trainer.zero_grad()
                if trainer.global_step % logging_steps == 0 or trainer.global_step == t_total:
                    y_true, y_pred = predict(trainer, num_dev_batch, 'dev')
                    acc = accuracy_score(y_true, y_pred)
                    if acc > best_score:
                        best_score = acc
                        trainer.save_pretrained(output_dir)
                        config.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                    tf.logging.info("***** Eval results *****")
                    tf.logging.info("  global step : {}".format(trainer.global_step))
                    tf.logging.info("  eval accuracy : {:.4f}".format(acc))
                    tf.logging.info("  best accuracy : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = classification_report(y_true, y_pred, target_names=labels, digits=4)
    tf.logging.info("***** Test results *****")
    for line in report.split('\n'):
        tf.logging.info(line)
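# --- A minimal sketch of the `predict` helper the loop above relies on,
# assuming the Trainer yields, per evaluation batch, the 'outputs' declared by
# the model_fn (logits and label ids). The method names `init_iterator` and
# `eval_step` are assumptions for illustration, not confirmed textToy APIs.
import numpy as np

def predict(trainer, num_batches, set_type):
    trainer.init_iterator(set_type)  # hypothetical: rewind the `set_type` handle
    y_true, y_pred = [], []
    for _ in range(num_batches):
        logits, label_ids = trainer.eval_step(set_type)  # hypothetical eval call
        y_pred.extend(np.argmax(logits, axis=-1).tolist())
        y_true.extend(label_ids.tolist())
    return y_true, y_pred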
dev_dataset, num_dev_batch = create_dataset("data/classification/dev.csv",
                                            vocab2id, label2id, max_seq_length,
                                            batch_size, set_type='dev')
test_dataset, num_test_batch = create_dataset("data/classification/test.csv",
                                              vocab2id, label2id, max_seq_length,
                                              batch_size, set_type='test')

output_types = {"input_ids": tf.int32, 'label_ids': tf.int64}
output_shapes = {"input_ids": tf.TensorShape([None, None]),
                 'label_ids': tf.TensorShape([None])}

trainer = Trainer('cnn', output_types, output_shapes, device='gpu')
trainer.build_model(get_model_fn(max_seq_length))

t_total = num_train_batch * epochs // gradient_accumulation_steps
train_op = create_optimizer(init_lr=learning_rate,
                            gradients=trainer.gradients,
                            variables=trainer.variables,
                            num_train_steps=t_total,
                            num_warmup_steps=int(t_total * 0.1))
trainer.compile(train_op, max_checkpoints=1)
trainer.build_handle(train_dataset, 'train')
trainer.build_handle(dev_dataset, 'dev')
trainer.build_handle(test_dataset, 'test')
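# --- A sketch of the per-example preprocessing that create_dataset implies for
# the CNN pipeline: look tokens up in vocab2id, truncate/pad to max_seq_length,
# and map the label through label2id. The UNK/PAD id values are assumptions
# about the vocabulary layout, not taken from textToy.
def encode_example(tokens, label, vocab2id, label2id, max_seq_length,
                   unk_id=1, pad_id=0):
    input_ids = [vocab2id.get(tok, unk_id) for tok in tokens[:max_seq_length]]
    input_ids += [pad_id] * (max_seq_length - len(input_ids))
    return {"input_ids": input_ids, "label_ids": label2id[label]}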
from textToy import Trainer, BertConfig, SequenceClassification, BertTokenizer
from textToy.data.classification import (return_types_and_shapes,
                                         convert_examples_to_features,
                                         create_dataset_by_gen)
from run_ptm import labels, predict, create_examples, classification_report

output_dir = 'ckpt/classification'
model_type = 'bert'
predict_file = "data/classification/test.csv"
batch_size = 32
max_seq_length = 32

config = BertConfig.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
output_types, output_shapes = return_types_and_shapes(for_trainer=True)
trainer = Trainer(model_type, output_types, output_shapes)

test_examples = create_examples(predict_file)
test_features = convert_examples_to_features(test_examples, tokenizer,
                                             max_seq_length, labels, 'test')
test_dataset, test_steps = create_dataset_by_gen(test_features, batch_size, 'test')


def get_model_fn():
    def model_fn(inputs, is_training):
        model = SequenceClassification(model_type=model_type,
                                       config=config,
                                       num_classes=len(labels),
                                       is_training=is_training,
                                       **inputs)
        return {'outputs': [model.logits, inputs['label_ids']]}

    return model_fn
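# --- How this inference setup plugs together, using only calls that appear in
# the training examples in this document; loading the fine-tuned weights from
# output_dir replaces training, so no train op is created (whether compile()
# can be skipped entirely for inference is an assumption).
trainer.build_model(get_model_fn())
trainer.build_handle(test_dataset, 'test')
trainer.from_pretrained(output_dir)

y_true, y_pred = predict(trainer, test_steps, 'test')
print(classification_report(y_true, y_pred, target_names=labels, digits=4))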
def main():
    set_seed(random_seed)
    config = CONFIGS[model_type].from_pretrained(model_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(model_dir, do_lower_case=True)

    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    # Multi-label targets need their own output types and shapes
    output_types, output_shapes = return_types_and_shapes(for_trainer=True,
                                                          is_multi_label=True)

    trainer = Trainer(model_type, output_types, output_shapes, device='gpu')
    trainer.build_model(get_model_fn(model_type, config, len(labels)))

    t_total = num_train_batch * epochs // gradient_accumulation_steps
    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))
    trainer.compile(train_op, max_checkpoints=1)
    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(test_dataset, 'test')
    trainer.from_pretrained(model_dir)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  Batch size = {}".format(batch_size))
    tf.logging.info("  Gradient accumulation steps = {}".format(gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (w. accumulation) = {}".format(
        batch_size * gradient_accumulation_steps))
    tf.logging.info("  Optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    best_score = 0.
    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch), desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            train_loss = trainer.backward()
            epoch_iter.set_description(desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))
            if (step + 1) % gradient_accumulation_steps == 0:
                trainer.train_step()
                trainer.zero_grad()

        # Evaluate once per epoch on the test set
        y_true, y_pred = predict(trainer, num_test_batch, 'test')
        score = multi_label_metric(
            y_true, y_pred, label_list=labels)['dict_result']['micro macro avg']['f1-score']
        if score > best_score:
            best_score = score
            trainer.save_pretrained(output_dir)
            config.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
        tf.logging.info("***** Eval results *****")
        tf.logging.info("  global step : {}".format(trainer.global_step))
        tf.logging.info("  eval score : {:.4f}".format(score))
        tf.logging.info("  best score : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = multi_label_metric(y_true, y_pred, label_list=labels)['string_result']
    with open(os.path.join(output_dir, 'result.txt'), 'w', encoding='utf-8') as f:
        f.write(report)
    tf.logging.info("***** Test results *****")
    for line in report.split('\n'):
        tf.logging.info(line)
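# --- For a multi-label head, predictions are typically obtained by thresholding
# sigmoid probabilities into a multi-hot vector. A plain numpy sketch; the 0.5
# threshold is a common default, not something multi_label_metric prescribes.
import numpy as np

def logits_to_multi_hot(logits, threshold=0.5):
    probs = 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))
    return (probs >= threshold).astype(int)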
def main():
    set_seed(random_seed)
    config = CONFIGS[model_type].from_pretrained(bert_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(bert_dir, do_lower_case=True)

    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    dev_dataset, num_dev_batch = load_dataset('dev', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    output_types, output_shapes = return_types_and_shapes(for_trainer=True)

    trainer = Trainer(model_type, output_types, output_shapes, device='gpu')
    trainer.build_model(get_model_fn(model_type, config, len(labels), add_crf))

    t_total = num_train_batch * epochs // gradient_accumulation_steps
    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))
    trainer.compile(train_op=train_op, max_checkpoints=1)
    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(dev_dataset, 'dev')
    trainer.build_handle(test_dataset, 'test')
    trainer.from_pretrained(bert_dir)

    best_score = 0.

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  Batch size = {}".format(batch_size))
    tf.logging.info("  Gradient accumulation steps = {}".format(gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (w. accumulation) = {}".format(
        batch_size * gradient_accumulation_steps))
    tf.logging.info("  Optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch), desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            train_loss = trainer.backward()
            epoch_iter.set_description(desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))
            if (step + 1) % gradient_accumulation_steps == 0:
                trainer.train_step()
                trainer.zero_grad()
                if trainer.global_step % logging_steps == 0 or trainer.global_step == t_total:
                    y_true, y_pred = predict(trainer, num_dev_batch, 'dev')
                    p, r, f = prf_score(y_true, y_pred)
                    if f > best_score:
                        best_score = f
                        trainer.save_pretrained(output_dir)
                        config.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                    tf.logging.info("***** Eval results *****")
                    tf.logging.info("  global step : {}".format(trainer.global_step))
                    tf.logging.info("  eval precision score : {:.4f}".format(p))
                    tf.logging.info("  eval recall score : {:.4f}".format(r))
                    tf.logging.info("  eval f1 score : {:.4f}".format(f))
                    tf.logging.info("  best f1 score : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = ner_report(y_true, y_pred)
    tf.logging.info("***** Test results *****")
    for line in report.split('\n'):
        tf.logging.info(line)
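# --- prf_score and ner_report compute entity-level metrics. A generic sketch of
# entity-level precision/recall/F1 over BIO tag sequences (a reimplementation
# for illustration, not textToy's actual prf_score; orphan I- tags are dropped
# rather than treated as span starts).
def bio_entities(tags):
    """Extract (start, end, type) spans from one BIO tag sequence."""
    spans, start, etype = [], None, None
    for i, tag in enumerate(list(tags) + ['O']):
        # Close an open span on 'O', a new 'B-', or an 'I-' of a different type
        if start is not None and (tag == 'O' or tag.startswith('B-') or tag[2:] != etype):
            spans.append((start, i, etype))
            start, etype = None, None
        if tag.startswith('B-'):
            start, etype = i, tag[2:]
    return spans


def entity_prf(y_true, y_pred):
    gold = {(i,) + s for i, seq in enumerate(y_true) for s in bio_entities(seq)}
    pred = {(i,) + s for i, seq in enumerate(y_pred) for s in bio_entities(seq)}
    tp = len(gold & pred)
    p = tp / len(pred) if pred else 0.
    r = tp / len(gold) if gold else 0.
    f = 2 * p * r / (p + r) if p + r else 0.
    return p, r, f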