Example #1
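This snippet is shown without its imports. A plausible set is sketched below, assuming recent FastEstimator and Hugging Face transformers releases (the exact module paths may differ by version); `char2idx`, `AttentionMask`, and `NERModel` are helpers defined elsewhere in the same example script rather than library imports.

import tempfile

import torch
import fastestimator as fe
from transformers import BertConfig, BertTokenizer

# Module paths below are assumptions based on recent FastEstimator releases:
from fastestimator.dataset.data import german_ner
from fastestimator.op.numpyop.univariate import PadSequence, Tokenize, WordtoId
from fastestimator.op.tensorop import Reshape
from fastestimator.op.tensorop.loss import CrossEntropy
from fastestimator.op.tensorop.model import ModelOp, UpdateOp
from fastestimator.trace.io import BestModelSaver
from fastestimator.trace.metric import Accuracy

# char2idx (label-to-index helper), AttentionMask (custom NumpyOp), and NERModel
# (torch.nn.Module) are defined elsewhere in the same example script.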
def get_estimator(max_len=20,
                  epochs=10,
                  batch_size=64,
                  max_train_steps_per_epoch=None,
                  max_eval_steps_per_epoch=None,
                  pretrained_model='bert-base-uncased',
                  save_dir=tempfile.mkdtemp(),
                  data_dir=None):
    # step 1: prepare data
    train_data, eval_data, data_vocab, label_vocab = german_ner.load_data(root_dir=data_dir)
    tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=True)
    tag2idx = char2idx(label_vocab)
    pipeline = fe.Pipeline(
        train_data=train_data,
        eval_data=eval_data,
        batch_size=batch_size,
        ops=[
            Tokenize(inputs="x", outputs="x", tokenize_fn=tokenizer.tokenize),
            WordtoId(inputs="x", outputs="x", mapping=tokenizer.convert_tokens_to_ids),
            WordtoId(inputs="y", outputs="y", mapping=tag2idx),
            PadSequence(max_len=max_len, inputs="x", outputs="x"),
            PadSequence(max_len=max_len, value=len(tag2idx), inputs="y", outputs="y"),
            AttentionMask(inputs="x", outputs="x_masks")
        ])

    # step 2: prepare model
    bert_config = BertConfig.from_pretrained(pretrained_model)
    num_hidden_layers = bert_config.to_dict()['num_hidden_layers']
    head_masks = [None] * num_hidden_layers
    model = fe.build(model_fn=lambda: NERModel(head_masks=head_masks, pretrained_model=pretrained_model),
                     optimizer_fn=lambda x: torch.optim.Adam(x, lr=1e-5))
    network = fe.Network(ops=[
        ModelOp(model=model, inputs=["x", "x_masks"], outputs="y_pred"),
        Reshape(inputs="y", outputs="y", shape=(-1, )),
        Reshape(inputs="y_pred", outputs="y_pred", shape=(-1, 24)),
        CrossEntropy(inputs=("y_pred", "y"), outputs="loss"),
        UpdateOp(model=model, loss_name="loss")
    ])

    traces = [Accuracy(true_key="y", pred_key="y_pred"), BestModelSaver(model=model, save_dir=save_dir)]

    # step 3: prepare estimator
    estimator = fe.Estimator(network=network,
                             pipeline=pipeline,
                             epochs=epochs,
                             traces=traces,
                             max_train_steps_per_epoch=max_train_steps_per_epoch,
                             max_eval_steps_per_epoch=max_eval_steps_per_epoch)

    return estimator
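A minimal usage sketch, assuming `get_estimator` is defined as above and the German NER data can be fetched; `fit()` is the standard fastestimator Estimator training entry point, and the argument values are hypothetical settings for a short run.

if __name__ == "__main__":
    est = get_estimator(epochs=3, batch_size=32)  # hypothetical settings for a quick run
    est.fit()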
Example #2
    def test_lower_case(self):
        op = Tokenize(inputs='x', outputs='x', to_lower_case=True)
        data = op.forward(data=self.lower_case_input, state={})
        self.assertTrue(
            is_equal(data, [['to', 'test', 'lowercase', 'parameter']]))
Example #3
    def test_multi_input(self):
        op = Tokenize(inputs='x', outputs='x')
        data = op.forward(data=self.multi_input, state={})
        self.assertTrue(is_equal(data, self.multi_output))
Example #4
    def test_single_input_tokenize_function(self):
        op = Tokenize(inputs='x', outputs='x', tokenize_fn=self.tokenize_fn)
        data = op.forward(data=self.single_input, state={})
        self.assertTrue(is_equal(data, self.tokenize_fn_output))
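Examples #2-#4 are methods of a unittest test case; the sketch below shows one possible `setUp` supplying the fixtures they reference. The concrete values and the import location of the `is_equal` helper are assumptions, chosen only to be consistent with the assertions above (by default Tokenize whitespace-splits each string, or applies `tokenize_fn` when one is given).

import unittest

from fastestimator.op.numpyop.univariate import Tokenize
from fastestimator.test.unittest_util import is_equal  # assumed location of the equality helper


class TestTokenize(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixtures, consistent with the assertions in the test methods above.
        self.lower_case_input = ["To Test LowerCase Parameter"]
        self.multi_input = ["first test string", "second test string"]
        self.multi_output = [["first", "test", "string"], ["second", "test", "string"]]
        self.tokenize_fn = lambda text: text.split("-")
        self.single_input = ["custom-tokenizer-test"]
        self.tokenize_fn_output = [["custom", "tokenizer", "test"]]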