Example #1
File: main.py Project: Vniex/NER
def train():
    train_data, train_label, word2id, word_embedding, max_sentence_len = load_all(settings.TRAIN_PATH,settings.VOCAB_PATH,
                                                                                  settings.VOCAB_EMBEDDING_PATH)
    # test no embedding
    # word_embedding=np.random.uniform(-0.25,0.25,word_embedding.shape)
    ner_model = NerModel(word2id, word_embedding, settings.TAGS, max_sentence_len, settings.EMBEDDING_SIZE)
    ner_model.train(train_data, train_label, save_path=settings.MODEL_PATH)
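
# (Sketch) a minimal entry point for main.py, assuming nothing else needs to run first:
if __name__ == '__main__':
    train()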
Example #2
#Preprocessing Data
data_train = load_data()
getter = modifying(data_train)
getter.get_next()

tag2id,n_tags,word2id,n_words = getter.indexing()
text_sequences, label_sequences = getter.padding(args.max_len, word2id, tag2id)  # pad all sentences to the same length

train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, label_sequences))  # convert to a tf.data.Dataset
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)

print("hidden_num:{}, vocab_size:{}, label_size:{}".format(args.hidden_num, len(word2id), len(tag2id)))

#######################################################################################################

model = NerModel(hidden_num = args.hidden_num, vocab_size = len(word2id)+1, label_size= len(tag2id), embedding_size = args.embedding_size)
optimizer = tf.keras.optimizers.Adam(args.lr)


ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt,args.output_dir,checkpoint_name='model.ckpt',max_to_keep=3)

#########################################################################################################

# @tf.function
def train_one_step(text_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits, text_lens, log_likelihood = model(text_batch, labels_batch, training=True)
        # the CRF log-likelihood returned by the model gives the training loss
        loss = -tf.reduce_mean(log_likelihood)
    gradients = tape.gradient(loss, model.trainable_variables)
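    # (Sketch) the example is cut off after computing the gradients; the usual
    # continuation applies them with the optimizer defined above and returns the loss.
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss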
Example #3
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result
import tensorflow_addons as tf_ad
from args_help import args
import json


vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.test_path, vocab2id, tag2id)



optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num = args.hidden_num, vocab_size =len(vocab2id), label_size = len(tag2id), embedding_size = args.embedding_size)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer,model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))


while True:
    text = input("input:")
    dataset = tf.keras.preprocessing.sequence.pad_sequences([[vocab2id.get(char,0) for char in text]], padding='post')
    print(dataset)
    logits, text_lens = model.predict(dataset)
    paths = []
    for logit, text_len in zip(logits, text_lens):
        viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
        paths.append(viterbi_path)
    print(paths[0])
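    # (Sketch) paths[0] holds raw tag ids; mapping them back through id2tag
    # (loaded by read_vocab above) pairs each input character with its tag.
    tags = [id2tag[idx] for idx in paths[0]]
    print(list(zip(text, tags)))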
Example #4
parser.add_argument("--embedding_size", type=int, default=32,help="embedding dim")
parser.add_argument("--output_dir", type=str, default='./checkpoint',help="output dir")
parser.add_argument("--lr", type=float, default=1e-3,help="lr")
parser.add_argument("--batch_size", type=int, default=64,help="lr")
args = parser.parse_args()

gpus=tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_visible_devices(devices=gpus[2], device_type='GPU')
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, text_lens, label_sequences = tokenize_pred(args.test_file, vocab2id, tag2id)
train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, text_lens, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)

optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num = args.hidden_num, vocab_size =len(vocab2id), label_size = len(tag2id), embedding_size = args.embedding_size)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer,model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))

for text_batch, text_lens, labels_batch in train_dataset:
    logits, _ = model.predict(text_batch)
    paths = []
    for logit, text_len, labels in zip(logits, text_lens, labels_batch):
        viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
        paths.append(viterbi_path)

    for i in range(len(text_batch)):
        res = {'text':[],'pred':[],'label':[]}
        for j,t in enumerate(paths[i]):
            res['text'].append(id2vocab.get(text_batch[i][j].numpy(),'<UKN>'))
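            # (Sketch) the example is truncated here; presumably the predicted and
            # gold tags are collected the same way (hypothetical lines using id2tag).
            res['pred'].append(id2tag.get(t, '<UKN>'))
            res['label'].append(id2tag.get(labels_batch[i][j].numpy(), '<UKN>'))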
Example #5
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)

# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))


def evaluationMetrics(id2tag, logits_batch, labels_batch):
    """
    (待加入模型)添加 presicion和 recall 作为测试集的评估方式
    logits_batch 表示预测值(单位为batch)
    labels_batch 表示真实值(单位为batch)
    """
    entity = []  # 真实的实体
    pre_entity = []  # 预测的实体
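    # (Sketch) the function is truncated here. A hypothetical completion, assuming
    # BIO-style tags and that both arguments are batches of tag-id sequences:
    def to_spans(tag_ids):
        # collect (start, end, entity_type) spans from a BIO tag sequence
        tags = [id2tag.get(int(t), 'O') for t in tag_ids] + ['O']
        spans, start, ent_type = [], None, None
        for i, tag in enumerate(tags):
            inside = tag.startswith('I-') and start is not None and tag[2:] == ent_type
            if not inside and start is not None:   # close the currently open span
                spans.append((start, i, ent_type))
                start, ent_type = None, None
            if tag.startswith('B-'):               # open a new span
                start, ent_type = i, tag[2:]
        return spans

    for k, (pred_ids, gold_ids) in enumerate(zip(logits_batch, labels_batch)):
        pre_entity += [(k,) + s for s in to_spans(pred_ids)]
        entity += [(k,) + s for s in to_spans(gold_ids)]
    correct = len(set(pre_entity) & set(entity))
    precision = correct / len(pre_entity) if pre_entity else 0.0
    recall = correct / len(entity) if entity else 0.0
    return precision, recall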
Example #6
# Get the training data
train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)  # drop the final partial batch

# # Test code; remove after use
# for _, (text_batch, labels_batch) in enumerate(train_dataset):
#     print(type(text_batch))
#     print(text_batch.shape)
#     print(text_batch)
#     break


# Build the model
logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(args.hidden_num, len(vocab2id), len(tag2id)))
model = NerModel(hidden_num = args.hidden_num, vocab_size = len(vocab2id), label_size= len(tag2id),
                 embedding_size = args.embedding_size, embedding_matrix=embedded_matrix)
optimizer = tf.keras.optimizers.Adam(args.lr)

ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt,
                                          args.output_dir,
                                          checkpoint_name='model.ckpt',
                                          max_to_keep=3)


# @tf.function
def train_one_step(text_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits, text_lens, log_likelihood = model(text_batch, labels_batch, training=True)  # invokes the model's call method
        # log_likelihood is used to form the loss
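        # (Sketch) the example is truncated here; the usual continuation, mirroring
        # Example #2, forms the loss and applies the gradients.
        loss = -tf.reduce_mean(log_likelihood)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# (Sketch) a typical outer loop, saving checkpoints with the ckpt_manager created
# above; the epoch count (args.epoch) and logging frequency are assumptions.
for epoch in range(args.epoch):
    for step, (text_batch, labels_batch) in enumerate(train_dataset):
        loss = train_one_step(text_batch, labels_batch)
        if step % 20 == 0:
            logger.info("epoch {}, step {}, loss {:.4f}".format(epoch, step, loss.numpy()))
    ckpt_manager.save()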
Example #7
def get_model():
    global __model
    if not __model:
        __model = NerModel()
    return __model
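# (Sketch) the lazy lookup above relies on a module-level cache that must exist
# before the first call; the assumed declaration is simply:
__model = None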
Example #8
#!/usr/bin/env python
# encoding: utf-8
'''
@author: Ben
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: [email protected]
@file: keras_run.py
@time: 2019/8/15 09:42
@desc:
'''

from model import NerModel
from utils import *

if __name__ == '__main__':
    log.i('Start main function.')

    model = NerModel()
    model.train() if is_train() else model.predict()

    log.i('Process finish')
Example #9
    logger.info("vocab file exits!!")

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(
    args.batch_size, drop_remainder=True)

logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(
    args.hidden_num, len(vocab2id), len(tag2id)))

model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size)

model.compile(optimizer=tf.keras.optimizers.Adam(args.lr),
              # loss=
              )

ckpt = tf.train.Checkpoint(optimizer=model.optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt,
                                          args.output_dir,
                                          checkpoint_name='model.ckpt',
                                          max_to_keep=3)

model.fit(train_dataset, epochs=args.epoch)  # the dataset is already batched, so batch_size is not passed to fit
# @tf.function