Example #1
def main():
    set_seed(random_seed)
    config = CONFIGS[model_type].from_pretrained(model_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(model_dir,
                                                       do_lower_case=True)

    # Create the train/dev/test datasets
    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    dev_dataset, num_dev_batch = load_dataset('dev', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    output_types, output_shapes = return_types_and_shapes(for_trainer=True)

    # Initialize the trainer
    trainer = Trainer(model_type,
                      output_types,
                      output_shapes,
                      use_xla=use_xla,
                      use_torch_mode=use_torch_mode)

    # Build the model
    trainer.build_model(
        get_model_fn(model_type, config, num_classes=len(labels)))

    t_total = num_train_batch * epochs // gradient_accumulation_steps
    # Create the train op
    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))  # warm up over 10% of the total optimizer steps

    # Configure the trainer: pass in the train op and the maximum number of checkpoints to keep
    trainer.compile(train_op, max_checkpoints=1)

    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(dev_dataset, 'dev')
    trainer.build_handle(test_dataset, 'test')

    # Load the pretrained weights for the pretrained model; if you don't load them, call trainer.init_variables() instead
    trainer.from_pretrained(model_dir)

    best_score = 0.

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  batch size = {}".format(batch_size))
    tf.logging.info("  Gradient Accumulation steps = {}".format(
        gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (accumulation) = {}".format(
        batch_size * gradient_accumulation_steps))
    tf.logging.info("  optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch),
                            desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            if use_torch_mode:
                train_loss = trainer.backward()
            else:
                train_loss = trainer.train_step()
            epoch_iter.set_description(
                desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))

            if (step + 1) % gradient_accumulation_steps == 0:
                if use_torch_mode:
                    trainer.train_step()
                    trainer.zero_grad()

                if trainer.global_step % logging_steps == 0 or trainer.global_step == t_total:
                    y_true, y_pred = predict(trainer, num_dev_batch, 'dev')
                    acc = accuracy_score(y_true, y_pred)
                    if acc > best_score:
                        best_score = acc
                        trainer.save_pretrained(output_dir)
                        config.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                    tf.logging.info("***** eval results *****")
                    tf.logging.info(" global step : {}".format(
                        trainer.global_step))
                    tf.logging.info(" eval accuracy : {:.4f}".format(acc))
                    tf.logging.info(
                        " best accuracy : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = classification_report(y_true,
                                   y_pred,
                                   target_names=labels,
                                   digits=4)
    tf.logging.info("***** test results *****")
    report = report.split('\n')
    for r in report:
        tf.logging.info(r)
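
The training examples on this page call a load_dataset(set_type, tokenizer) helper that is defined elsewhere in the script and not shown here. As rough orientation, below is a minimal sketch of what such a helper could look like, assuming the classification utilities imported in Example #3 (convert_examples_to_features, create_dataset_by_gen), a script-level create_examples function, and module-level labels, max_seq_length, and batch_size settings; the per-split CSV path is an assumption modeled on Example #3's predict_file.

# Hypothetical sketch; not part of textToy itself.
from textToy.data.classification import convert_examples_to_features, create_dataset_by_gen


def load_dataset(set_type, tokenizer):
    # Assumed layout: one CSV per split (train/dev/test), mirroring Example #3's predict_file.
    examples = create_examples("data/classification/{}.csv".format(set_type))
    features = convert_examples_to_features(examples, tokenizer, max_seq_length, labels, set_type)
    dataset, num_batches = create_dataset_by_gen(features, batch_size, set_type)
    return dataset, num_batches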
Example #2
    # (Reconstructed call: the opening of this example is cut off, so the dev-split path
    #  is assumed by analogy with the test-split call below.)
    dev_dataset, num_dev_batch = create_dataset("data/classification/dev.csv",
                                                vocab2id,
                                                label2id,
                                                max_seq_length,
                                                batch_size,
                                                set_type='dev')
    test_dataset, num_test_batch = create_dataset(
        "data/classification/test.csv",
        vocab2id,
        label2id,
        max_seq_length,
        batch_size,
        set_type='test')
    output_types = {"input_ids": tf.int32, 'label_ids': tf.int64}
    output_shapes = {
        "input_ids": tf.TensorShape([None, None]),
        'label_ids': tf.TensorShape([None])
    }

    trainer = Trainer('cnn', output_types, output_shapes, device='gpu')

    trainer.build_model(get_model_fn(max_seq_length))

    t_total = num_train_batch * epochs // gradient_accumulation_steps

    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))

    trainer.compile(train_op, max_checkpoints=1)
    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(dev_dataset, 'dev')
    trainer.build_handle(test_dataset, 'test')
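
Example #2 is the only listing that writes output_types and output_shapes out by hand, which makes the Trainer's data contract explicit: every dataset passed to build_handle must yield dicts with exactly these keys, whose batched elements match those dtypes and shapes. For illustration, here is a minimal, hypothetical TF 1.x generator dataset with that structure; this is not the library's create_dataset, and the (token_ids, label_id) feature format and padding scheme are assumptions.

import tensorflow as tf


def make_toy_dataset(features, batch_size):
    """Build a dataset whose batched elements match the dicts above:
    input_ids -> int32 [batch, seq_len], label_ids -> int64 [batch].
    `features` is assumed to be a list of (token_id_list, label_id) pairs."""
    def gen():
        for input_ids, label_id in features:
            yield {"input_ids": input_ids, "label_ids": label_id}

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_types={"input_ids": tf.int32, "label_ids": tf.int64},
        output_shapes={"input_ids": tf.TensorShape([None]),
                       "label_ids": tf.TensorShape([])})
    # Pad variable-length token sequences within each batch.
    return dataset.padded_batch(
        batch_size,
        padded_shapes={"input_ids": tf.TensorShape([None]),
                       "label_ids": tf.TensorShape([])})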
Example #3
from textToy import Trainer, BertConfig, SequenceClassification, BertTokenizer
from textToy.data.classification import return_types_and_shapes, convert_examples_to_features, create_dataset_by_gen
from run_ptm import labels, predict, create_examples, classification_report

output_dir = 'ckpt/classification'
model_type = 'bert'
predict_file = "data/classification/test.csv"
batch_size = 32
max_seq_length = 32

config = BertConfig.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)

output_types, output_shapes = return_types_and_shapes(for_trainer=True)

trainer = Trainer(model_type, output_types, output_shapes)

test_examples = create_examples(predict_file)
test_features = convert_examples_to_features(test_examples, tokenizer, max_seq_length, labels, 'test')
test_dataset, test_steps = create_dataset_by_gen(test_features, batch_size, 'test')


def get_model_fn():
    def model_fn(inputs, is_training):
        model = SequenceClassification(model_type=model_type,
                                       config=config,
                                       num_classes=len(labels),
                                       is_training=is_training,
                                       **inputs)
        return {'outputs': [model.logits, inputs['label_ids']]}

    # get_model_fn must hand the inner model_fn to trainer.build_model, as in the other examples
    return model_fn
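
The listing for Example #3 stops at the model function; the wiring that follows is not shown. Below is a hedged sketch of how this inference-only script could continue, using only Trainer calls that already appear in the other examples and the predict helper imported from run_ptm above; the argument order of predict is copied from its use in Examples #1 and #5, and output_dir is assumed to hold the fine-tuned checkpoint.

# Hypothetical continuation (sketch, not the original script).
trainer.build_model(get_model_fn())
trainer.build_handle(test_dataset, 'test')
trainer.from_pretrained(output_dir)

y_true, y_pred = predict(trainer, test_steps, 'test')
print(classification_report(y_true, y_pred, target_names=labels, digits=4))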
Example #4
def main():
    set_seed(random_seed)
    config = CONFIGS[model_type].from_pretrained(model_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(model_dir, do_lower_case=True)

    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    output_types, output_shapes = return_types_and_shapes(for_trainer=True, is_multi_label=True)

    trainer = Trainer(
        model_type, output_types, output_shapes, device='gpu'
    )

    trainer.build_model(get_model_fn(model_type, config, len(labels)))

    t_total = num_train_batch * epochs // gradient_accumulation_steps

    train_op = create_optimizer(
        init_lr=learning_rate,
        gradients=trainer.gradients,
        variables=trainer.variables,
        num_train_steps=t_total,
        num_warmup_steps=int(t_total * 0.1))

    trainer.compile(
        train_op, max_checkpoints=1)

    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(test_dataset, 'test')

    trainer.from_pretrained(model_dir)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  batch size = {}".format(batch_size))
    tf.logging.info("  Gradient Accumulation steps = {}".format(gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (accumulation) = {}".format(batch_size * gradient_accumulation_steps))
    tf.logging.info("  optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    best_score = 0.
    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch), desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            train_loss = trainer.backward()
            epoch_iter.set_description(desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))

            if (step + 1) % gradient_accumulation_steps == 0:
                trainer.train_step()
                trainer.zero_grad()

        y_true, y_pred = predict(trainer, num_test_batch, 'test')
        score = multi_label_metric(y_true, y_pred, label_list=labels)['dict_result']['micro macro avg']['f1-score']
        if score > best_score:
            best_score = score
            trainer.save_pretrained(output_dir)
            config.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
        tf.logging.info("***** eval results *****")
        tf.logging.info(" global step : {}".format(trainer.global_step))
        tf.logging.info(" eval score : {:.4f}".format(score))
        tf.logging.info(" best score : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = multi_label_metric(y_true, y_pred, label_list=labels)['string_result']
    with open(os.path.join(output_dir, 'result.txt'), 'w', encoding='utf-8') as f:
        f.write(report)
    tf.logging.info("***** test results *****")
    report = report.split('\n')
    for r in report:
        tf.logging.info(r)
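
Example #4's evaluation goes through predict and multi_label_metric, both defined in the surrounding script and not shown here. For orientation only, here is a minimal numpy sketch of the usual post-processing step for multi-label logits (sigmoid followed by thresholding); the function name and the 0.5 cutoff are assumptions, not part of textToy.

import numpy as np


def logits_to_multi_hot(logits, threshold=0.5):
    """Convert per-label sigmoid logits into a multi-hot prediction matrix."""
    probs = 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))
    return (probs >= threshold).astype(np.int32)


# Example: 2 samples, 3 labels -> [[1, 0, 1], [0, 1, 1]]
preds = logits_to_multi_hot([[2.0, -1.0, 0.3], [-3.0, 0.1, 1.5]])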
Example #5
def main():
    set_seed(random_seed)

    config = CONFIGS[model_type].from_pretrained(bert_dir)
    tokenizer = TOKENIZERS[model_type].from_pretrained(bert_dir,
                                                       do_lower_case=True)

    train_dataset, num_train_batch = load_dataset('train', tokenizer)
    dev_dataset, num_dev_batch = load_dataset('dev', tokenizer)
    test_dataset, num_test_batch = load_dataset('test', tokenizer)

    output_types, output_shapes = return_types_and_shapes(for_trainer=True)

    trainer = Trainer(model_type, output_types, output_shapes, device='gpu')

    trainer.build_model(get_model_fn(model_type, config, len(labels), add_crf))

    t_total = num_train_batch * epochs // gradient_accumulation_steps

    train_op = create_optimizer(init_lr=learning_rate,
                                gradients=trainer.gradients,
                                variables=trainer.variables,
                                num_train_steps=t_total,
                                num_warmup_steps=int(t_total * 0.1))

    trainer.compile(train_op=train_op, max_checkpoints=1)

    trainer.build_handle(train_dataset, 'train')
    trainer.build_handle(dev_dataset, 'dev')
    trainer.build_handle(test_dataset, 'test')

    trainer.from_pretrained(bert_dir)

    best_score = 0.

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num epochs = {}".format(epochs))
    tf.logging.info("  batch size = {}".format(batch_size))
    tf.logging.info("  Gradient Accumulation steps = {}".format(
        gradient_accumulation_steps))
    tf.logging.info("  Total train batch size (accumulation) = {}".format(
        batch_size * gradient_accumulation_steps))
    tf.logging.info("  optimizer steps = %d", t_total)
    tf.logging.info("  Num devices = {}".format(trainer.num_devices))
    tf.logging.info("  Num params = {}".format(trainer.num_params))

    for epoch in range(epochs):
        epoch_iter = bar_fn(range(num_train_batch),
                            desc='epoch {} '.format(epoch + 1))
        for step in epoch_iter:
            train_loss = trainer.backward()
            epoch_iter.set_description(
                desc='epoch {}, loss {:.4f}'.format(epoch + 1, train_loss))

            if (step + 1) % gradient_accumulation_steps == 0:
                trainer.train_step()
                trainer.zero_grad()
                if trainer.global_step % logging_steps == 0 or trainer.global_step == t_total:
                    y_true, y_pred = predict(trainer, num_dev_batch, 'dev')
                    p, r, f = prf_score(y_true, y_pred)
                    if f > best_score:
                        best_score = f
                        trainer.save_pretrained(output_dir)
                        config.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                    tf.logging.info("***** eval results *****")
                    tf.logging.info(" global step : {}".format(
                        trainer.global_step))
                    tf.logging.info(" eval precision score : {:.4f}".format(p))
                    tf.logging.info(" eval recall score : {:.4f}".format(r))
                    tf.logging.info(" eval f1 score : {:.4f}".format(f))
                    tf.logging.info(
                        " best f1 score : {:.4f}".format(best_score))

    tf.logging.info("***** Running Test *****")
    trainer.from_pretrained(output_dir)
    y_true, y_pred = predict(trainer, num_test_batch, 'test')
    report = ner_report(y_true, y_pred)
    tf.logging.info("***** test results *****")
    report = report.split('\n')
    for r in report:
        tf.logging.info(r)