Example #1
 def test_TFBertForTokenClassification(self):
     from transformers import BertConfig, TFBertForTokenClassification
     keras.backend.clear_session()
     # pretrained_weights = 'bert-base-uncased'
     tokenizer_file = 'bert_bert-base-uncased.pickle'
     tokenizer = self._get_tokenzier(tokenizer_file)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     config = BertConfig()
     model = TFBertForTokenClassification(config)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
Example #2
def load_model(num_labels, trainable=True):
    # Use BertForTokenClassification
    model = TFBertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=num_labels)
    model.trainable = trainable

    return model
Example #3
 def build_model(self,
                 max_length,
                 train_batch_size,
                 learning_rate,
                 epochs,
                 num_labels,
                 tagset=None,
                 gpu_growth=True,
                 eval_batch_size=32):
     #if gpu_growth:
     #    model_utils.set_tf_memory_growth()
     if self.task == "pos":
         self.model = TFBertForTokenClassification.from_pretrained(
             self.model_name, num_labels=num_labels, from_pt=True)
         self.tokenizer = MBERT_Tokenizer_pos.from_pretrained(
             self.model_name, do_lower_case=False)
     else:
         self.model = TFBertForSequenceClassification.from_pretrained(
             self.model_name, num_labels=num_labels, from_pt=True)
         self.tokenizer = BertTokenizer.from_pretrained(self.model_name,
                                                        do_lower_case=False)
     #self.model, self.tokenizer = model_utils.create_model(self.short_model_name, self.task, num_labels)
     self.model = model_utils.compile_model(self.model, self.task,
                                            learning_rate)
     print("Successfully built", self.model_name)
     self.max_length = max_length
     self.train_batch_size = train_batch_size
     self.learning_rate = learning_rate
     self.epochs = epochs
     self.num_labels = num_labels
     if tagset:
         self.tagset = tagset
         self.label_map = {label: i for i, label in enumerate(tagset)}
     self.eval_batch_size = eval_batch_size
Example #4
def build_model(pretrained_model_name_or_path, num_labels):
    config = BertConfig.from_pretrained(pretrained_model_name_or_path,
                                        num_labels=num_labels)
    model = TFBertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path, config=config)
    model.layers[-1].activation = tf.keras.activations.softmax
    return model
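
Because build_model() above replaces the classifier's activation with softmax, the model outputs probabilities rather than raw logits; a minimal compile sketch pairing it with a matching loss (the checkpoint name, label count and optimizer settings below are illustrative assumptions):

import tensorflow as tf

# Hypothetical usage of build_model() from the example above.
model = build_model("bert-base-multilingual-cased", num_labels=9)   # assumed checkpoint and label count
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),          # assumed hyperparameters
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),  # outputs are already softmax-ed
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")])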
Example #5
def get_model():
    config = BertConfig.from_pretrained('bert-base-multilingual-cased',
                                        num_labels=3)
    model = TFBertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", config=config)
    model.layers[-1].activation = tf.keras.activations.softmax
    print(model.summary())
    return model
Example #6
    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = TFBertForTokenClassification.from_pretrained(
            'bert-base-chinese', return_dict=True, num_labels=7)

        # Alternatively, build the token-classification head yourself:
        # self.bert = TFBertModel.from_pretrained('bert-base-chinese')
        # self.dropout = tf.keras.layers.Dropout(0.1)
        # self.classifier = tf.keras.layers.Dense(7, name="classifier")
        pass
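
A rough sketch of the "build it yourself" variant hinted at by the commented-out lines above, assuming a Keras subclassed model whose call() receives the usual tokenizer output dict (class and variable names are illustrative):

import tensorflow as tf
from transformers import TFBertModel

class ManualTokenClassifier(tf.keras.Model):   # hypothetical name
    def __init__(self, num_labels=7):
        super().__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-chinese')
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(num_labels, name="classifier")

    def call(self, inputs, training=False):
        outputs = self.bert(inputs, training=training)
        sequence_output = self.dropout(outputs.last_hidden_state, training=training)
        return self.classifier(sequence_output)   # per-token logits, shape (batch, seq_len, num_labels)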
Example #7
def getBertModel():
    config.num_labels = 2
    bertModel = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL,
                                                             config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,
                                         epsilon=1e-08,
                                         clipnorm=1.0)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    bertModel.compile(optimizer=optimizer, loss=loss)

    return bertModel
Example #8
 def test_TFBertForTokenClassification(self):
     from transformers import BertTokenizer, TFBertForTokenClassification
     pretrained_weights = 'bert-base-uncased'
     tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFBertForTokenClassification.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
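
For reference, the exported graph can also be scored directly with ONNX Runtime outside the test harness; a minimal sketch, assuming inputs_onnx maps ONNX input names to NumPy arrays as prepared above:

import onnxruntime as ort

# Serialize the converted model and run it with ONNX Runtime on CPU.
sess = ort.InferenceSession(onnx_model.SerializeToString(),
                            providers=["CPUExecutionProvider"])
onnx_outputs = sess.run(None, inputs_onnx)   # None -> return all graph outputs
print([o.shape for o in onnx_outputs])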
Example #9
def load_saved_model_test(eval_batch_size=32,
                          model_path="96_64",
                          file_path=os.path.join(c.PROCESSED_DATASET_DIR,
                                                 c.TEST_FILE)):
    """Create Features & Tokenize"""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=True)

    trained_model = TFBertForTokenClassification.from_pretrained(model_path)

    optimizer = tf.keras.optimizers.Adam()

    metrics = [
        keras.metrics.SparseCategoricalAccuracy('micro_f1/cat_accuracy',
                                                dtype=tf.float32), macro_f1
    ]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    logging.info("Compiling Model ...")

    trained_model.compile(optimizer=optimizer,
                          loss=loss,
                          metrics=metrics,
                          run_eagerly=True)

    test_data, test_inputs, test_labels = extract_features.retrieve_pred_features(
        file_path, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer, c.LABEL_ID_PKL_FILE)
    # # Test Scores
    # test_loss, test_acc, test_f1_macro = trained_model.evaluate(test_data, batch_size=eval_batch_size)
    # logging.info(str({"Loss": test_loss, "Micro F1/Accuracy": test_acc, "Macro F1": test_f1_macro}))

    # evaluate model with sklearn
    predictions = np.argmax(trained_model.predict(test_data,
                                                  batch_size=eval_batch_size,
                                                  verbose=1).logits,
                            axis=-1)
    print(np.shape(predictions), np.shape(test_labels))
    sk_report, macro_f1_score, micro_f1_score, macro_recall_score, macro_precision_score = calculate_pred_metrics(
        test_labels, predictions)

    print('\n', sk_report)
    logging.info(sk_report)

    logging.info("****TEST METRICS****")
    metrics_dict = {
        "Macro_F1": macro_f1_score,
        "Micro_F1": micro_f1_score,
        "Macro_Precision": macro_precision_score,
        "Macro_Recall": macro_recall_score
    }
    logging.info(str(metrics_dict))
    return f'bert_eval_{macro_f1_score}_{uuid.uuid4()}'
Example #10
def load(name):
    try:
        with open(f'./app/main/models/utils/{name}_idx2tag.pickle', 'rb') as handle:
            idx2tag = pickle.load(handle)
    except FileNotFoundError:
        idx2tag = None

    if name == 'ner':
        model = TFBertForTokenClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(idx2tag)
        )
    else:
        raise ValueError(f"No such model: {name}")
    
    with open(f'./app/main/models/weights/{name}.pickle', 'rb') as handle:
        model.set_weights(pickle.load(handle))
    
    with open(f'./app/main/models/tokenizers/{name}.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    
    return model, tokenizer, idx2tag
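
A minimal sketch of tagging a sentence with the objects returned by load(); it assumes a Hugging Face tokenizer was pickled and that idx2tag maps label ids back to tag strings:

import numpy as np

model, tokenizer, idx2tag = load('ner')

encoded = tokenizer("Hugging Face is based in New York", return_tensors='tf')
logits = model(encoded).logits                                  # (1, seq_len, num_labels)
pred_ids = np.argmax(logits, axis=-1)[0]
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].numpy().tolist())
for token, tag_id in zip(tokens, pred_ids):
    print(token, idx2tag[int(tag_id)])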
Example #11
import tensorflow as tf
import transformers
from transformers import TFBertForTokenClassification, TFXLMRobertaForTokenClassification

if __name__ == "__main__":

    model = TFBertForTokenClassification.from_pretrained(
        "../norbert3/model.ckpt-1060000.data-00000-of-00001", from_tf=True)
Example #12
        con1 = (pred_flat == 1)
        con2 = (labels_flat == 1)

        part = np.where(con1 & con2)
        correct = len(part[0])
        sum_pred = np.sum(pred_flat)
        sum_true = np.sum(labels_flat)

        return correct, sum_pred, sum_true

    def get(self):
        return self.reports


# model configuration
bertModel = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL,
                                                         num_labels=2)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,
                                     epsilon=1e-08,
                                     clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True)  # labels as integer
# loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)  # only two labels (one-hot)
# loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True) # two or more one-hot encoding
# metric = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
eval_metrics = EvaluateModel(train_x, tr_tags, val_x, val_tags)
# bertModel.compile(optimizer=optimizer, loss=loss, metrics=metric)
bertModel.compile(optimizer=optimizer, loss=loss)

# training
# bertModel.fit(x=train_x, y=train_y, epochs=EPOCHS, callbacks=[eval_metrics])
Example #13
from transformers import BertTokenizer, TFBertForTokenClassification, BertForTokenClassification
import tensorflow as tf
import numpy as np
import glob
import loadData

tf.random.set_seed(2019)
np.random.seed(2019)

MAX_TOKEN = 256
PRETRAINED_MODEL = 'bert-base-uncased'
val_path = './SemEval18_Task12/Training/Validation_Data_Codalab/detection'

tf_model = TFBertForTokenClassification.from_pretrained('./save/')
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

token_inputs = tf.keras.Input(shape=(None, ), name='input_ids', dtype='int32')
mask_inputs = tf.keras.Input(shape=(None, ),
                             name='attention_mask',
                             dtype='int32')
segment_inputs = tf.keras.Input(shape=(None, ),
                                name='token_type_ids',
                                dtype='int32')

annFiles = glob.glob(val_path + '/*.ann')

with open('./result/result.html', 'w') as out:
    for annFile in annFiles:
        txtFile = annFile.replace('.ann', '.txt')
        lastIdx = 0
        print(txtFile)
Example #14
def train_test(epochs,
               eval_batch_size,
               epsilon=1e-7,
               init_lr=2e-5,
               beta_1=0.9,
               beta_2=0.999):
    mlflow.log_params({
        "epochs": epochs,
        "eval_batch_size": eval_batch_size,
        "epsilon": epsilon,
        "init_lr": init_lr,
        "beta_1": beta_1,
        "beta_2": beta_2
    })
    """Create Features & Tokenize"""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=True)
    train_data = extract_features.retrieve_features(c.TRAIN_FILE, c.LABELS,
                                                    c.MAX_SEQ_LENGTH,
                                                    tokenizer,
                                                    c.LABEL_ID_PKL_FILE)

    config = BertConfig.from_pretrained('bert-base-multilingual-cased',
                                        num_labels=len(c.LABELS))
    model = TFBertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", config=config)
    model.summary()

    model.layers[-1].activation = tf.keras.activations.softmax
    optimizer = tf.keras.optimizers.Adam(learning_rate=init_lr,
                                         epsilon=epsilon,
                                         beta_1=beta_1,
                                         beta_2=beta_2)

    metrics = [
        keras.metrics.SparseCategoricalAccuracy('micro_f1/cat_accuracy',
                                                dtype=tf.float32), macro_f1
    ]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    logging.info("Compiling Model ...")

    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics,
                  run_eagerly=True)

    logging.info("Model has been compiled")

    val_data, val_inputs, val_labels = extract_features.retrieve_features(
        c.VALIDATION_FILE, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer,
        c.LABEL_ID_PKL_FILE)
    test_data, test_inputs, test_labels = extract_features.retrieve_features(
        c.TEST_FILE, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer,
        c.LABEL_ID_PKL_FILE)

    logging.info("Test Validation features are ready")

    f1_metric = EvalMetrics(val_data, val_labels, eval_batch_size)
    model.fit(train_data,
              epochs=epochs,
              validation_data=val_data,
              callbacks=[f1_metric])

    logging.info("Model Fitting is done")

    # Save Model
    save_dir_path = os.path.join(c.FINAL_OUTPUT_DIR,
                                 "model_" + str(time.time()))
    os.mkdir(save_dir_path)
    # tf.saved_model.save(model, export_dir=save_dir_path)
    model.save_pretrained(save_dir_path, saved_model=True)
    logging.info("Model Saved at: {}".format(save_dir_path))

    # Test Scores
    test_loss, test_acc, test_f1_macro = model.evaluate(
        test_data, batch_size=eval_batch_size)
    logging.info(
        str({
            "Loss": test_loss,
            "Micro F1/Accuracy": test_acc,
            "Macro F1": test_f1_macro
        }))

    # evaluate model with sklearn
    predictions = np.argmax(model.predict(test_data,
                                          batch_size=eval_batch_size,
                                          verbose=1).logits,
                            axis=-1)
    print(np.shape(predictions), np.shape(test_labels))
    sk_report, macro_f1_score, micro_f1_score, macro_recall_score, macro_precision_score = calculate_pred_metrics(
        test_labels, predictions)

    print('\n', sk_report)
    logging.info(sk_report)

    logging.info("****TEST METRICS****")
    metrics_dict = {
        "Loss": test_loss,
        "CatAcc": test_acc,
        "Macro_F1": macro_f1_score,
        "Micro_F1": micro_f1_score,
        "Macro_Precision": macro_precision_score,
        "Macro_Recall": macro_recall_score
    }
    logging.info(str(metrics_dict))
    mlflow.log_metrics(metrics_dict)

    return save_dir_path, [
        f'epochs:{epochs}', f'eval_batch_size: {eval_batch_size}',
        f'epsilon: {epsilon}', f'init_lr: {init_lr}', f'beta_1: {beta_1}',
        f'beta_2: {beta_2}'
    ], f'bert_{test_acc}_{macro_f1_score}_{uuid.uuid4()}'
Example #15
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForTokenClassification
from model.model import Model
''' parameters '''
model_name = 'bert-base-cased'
num_labels = 2
max_length = 64
weights_path = 'model/saved_weights/weights.0.21.h5'
''' load model '''
tokenizer = BertTokenizer.from_pretrained(model_name)
model = Model(
    TFBertForTokenClassification.from_pretrained(model_name,
                                                 num_labels=num_labels)
)  # need to optimize this step by loading config instead of weights
model(tf.zeros([1, 3, max_length], tf.int32))
model.load_weights(weights_path)
model.compile(run_eagerly=True)
''' score passages '''
TEXT = "The origin of the name Moabit is disputed. According to one account, \
it can be traced back to the Huguenots, in the time of King Frederick William I of Prussia. \
These French refugees are said to have named their new residence in reference \
to the Biblical description of the Israelites in the country of Moab, where they \
stayed before being allowed to enter Canaan. Other possible origins include \
the German (Berlin dialect) \"Moorjebiet\" (swamp area). "

inputs_ = tokenizer.encode_plus(text=TEXT,
                                max_length=max_length,
                                pad_to_max_length=True,
                                return_token_type_ids=True,
                                return_attention_mask=True)
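
The snippet ends before the encoded inputs are scored; a sketch of one way they might be fed to the wrapper, assuming the (1, 3, max_length) stacking of input_ids, token_type_ids and attention_mask implied by the tf.zeros([1, 3, max_length]) call used to build the model above:

import numpy as np

# Stack the encoded fields to match the dummy tensor the model was built with (assumed order).
stacked = np.stack([inputs_['input_ids'],
                    inputs_['token_type_ids'],
                    inputs_['attention_mask']])[np.newaxis, ...]   # shape (1, 3, max_length)
scores = model(tf.constant(stacked, dtype=tf.int32))
print(scores)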
Example #16
 def __init__(self, model_path, num_tags, dropout):
     self.model_path = model_path
     self.num_tags = num_tags
     self.dropout = dropout
     self.encoder = TFBertForTokenClassification.from_pretrained(
         self.model_path)
Example #17
config.num_labels = 2

# maxTokenLen, sent_tokens, sent_token_ids, sent_token_spans, sent_token_tags = \
#     alignSpansBySentence(train_sentences, train_spans)
# input_ids, input_masks, input_tags, token_spans = \
#     chunkData(maxTokenLen, sent_tokens, sent_token_ids, sent_token_spans, sent_token_tags)

train_x, train_y = buildData(train_sentences, train_spans)
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
print('train data loaded:({0})'.format(len(train_y)))

val_x, val_y = buildData(val_sentences, val_spans)
val_dataset = tf.data.Dataset.from_tensor_slices((val_x, val_y))
print('validation data loaded:({0})'.format(len(val_y)))

model = TFBertForTokenClassification.from_pretrained('bert-base-uncased',
                                                     config=config)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5,
                                       epsilon=1e-08,
                                       clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

model.fit(train_dataset.shuffle(len(train_y)).batch(BATCH_SIZE),
          validation_data=val_dataset.batch(BATCH_SIZE),
          epochs=EPOCHS,
          batch_size=BATCH_SIZE)

model.save_weights('/content/drive/MyDrive/ncg/model-SI-BERT/')
print('*****************model saved!******************')
Example #18
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])

    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, _, dev_len = load_data(args["dev_file"])
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))

    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])
    # get tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()
    # prepare model inputs and targets
    train_x, train_y = create_inputs_targets_roberta(train_data, train_label,
                                                     tag2id, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets_roberta(dev_data, dev_label,
                                                 tag2id, max_len, tokenizer)

    # create model bert
    model = TFBertForTokenClassification.from_pretrained(args["pretrain_model_path"],
                                                         from_pt=True,
                                                         num_labels=len(list(tag2id.keys())))
    # optimizer Adam
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
    # we do not have one-hot vectors, we can use sparse categorical cross entropy and accuracy
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size
             )   # , validation_split=0.1

    # model save
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)

    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")

    # model evaluation
    precision, recall, f1 = model_evaluate_roberta(model, dev_x, dev_label_ori,
                                                   tag2id, batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(precision, recall, f1))
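
A minimal sketch of restoring the weights saved above for later inference; it rebuilds the model with the same pretrained path and label count, runs one forward pass to create the variables, and assumes dev_x has the same format passed to fit() (names mirror the locals of main() above):

# Rebuild the architecture, then restore the trained weights from ner_model.h5.
restored = TFBertForTokenClassification.from_pretrained(args["pretrain_model_path"],
                                                        from_pt=True,
                                                        num_labels=len(tag2id))
restored(dev_x)                                            # build variables with a forward pass
restored.load_weights(os.path.join(args["output_path"], "ner_model.h5"))
pred_logits = restored.predict(dev_x, batch_size=batch_size).logits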