def __init__(self, **kwargs):
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=kwargs['vocab_file'],
        do_lower_case=True)
    self.max_seq_len = 70
    self.ckpt_path = kwargs['model_dir']
    self.init_checkpoint = kwargs['init_checkpoint_file']
    self.bert_config = kwargs['bert_config_dir']
    self.graph = kwargs["graph"]
    with self.graph.as_default():
        self.model = Model(init_checkpoint_file=self.init_checkpoint,
                           bert_config_dir=self.bert_config)
        self.saver = tf.train.Saver()
        config = tf.ConfigProto(log_device_placement=False)
        self.session = tf.Session(graph=self.graph, config=config)
        self.load()
def __init__(self):
    self.lstm_dim = 128
    self.batch_size = 1
    self.max_seq_len = 70
    self.clip = 5.0
    self.dropout_keep = 0.5
    self.optimizer = 'adam'
    self.lr = 0.001
    self.tag_schema = 'iob'
    self.ckpt_path = '..\\models'
    self.steps_check = 10
    self.zeros = False
    self.lower = True
    self.max_epoch = 2
    self.num_tags = len(convert_samples.tag_to_id)
    self.model = Model(init_checkpoint_file='D:\\models\\albert_base_zh\\albert_model.ckpt',
                       bert_config_dir='D:\\models\\albert_base_zh\\albert_config_base.json')
    self.saver = tf.train.Saver()
    self.tokenizer = tokenization.FullTokenizer(vocab_file='D:\\models\\albert_base_zh\\vocab.txt',
                                                do_lower_case=True)
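The number of output tags is taken from convert_samples.tag_to_id, which also has to contain the special "[CLS]", "[SEP]" and "X" labels used during example conversion further down. As a rough illustration only (the real mapping and entity types live in convert_samples), an IOB-style mapping could look like this:

# Illustrative IOB tag mapping; the actual tag set is defined in
# convert_samples and will use the project's own entity types.
tag_to_id = {
    "O": 0,
    "B-LOC": 1, "I-LOC": 2,
    "B-PER": 3, "I-PER": 4,
    "B-ORG": 5, "I-ORG": 6,
    "X": 7,        # continuation label for sub-tokens after the first piece
    "[CLS]": 8,
    "[SEP]": 9,
}
id_to_tag = {v: k for k, v in tag_to_id.items()}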
# Standard-library and third-party imports used by the code below;
# tokenization, Model, BatchManager, convert_samples and bio_to_json are
# assumed to be importable from the surrounding project.
import codecs
import json
import os

import numpy as np
import tensorflow as tf


class modelTrain:
    def __init__(self):
        self.lstm_dim = 128
        self.batch_size = 8
        self.max_seq_len = 70
        self.clip = 5.0
        self.dropout_keep = 0.5
        self.optimizer = 'adam'
        self.lr = 0.001
        self.tag_schema = 'iob'
        self.ckpt_path = '..\\models'
        self.steps_check = 10
        self.zeros = False
        self.lower = True
        self.max_epoch = 2
        self.num_tags = len(convert_samples.tag_to_id)
        self.model = Model(
            init_checkpoint_file='D:\\迅雷下载\\albert_tiny_489k\\albert_model.ckpt',
            bert_config_dir='D:\\迅雷下载\\albert_tiny_489k\\albert_config_tiny.json')
        self.saver = tf.train.Saver()
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file='D:\\迅雷下载\\albert_tiny_489k\\vocab.txt',
            do_lower_case=True)

    def train(self):
        path = '..\\data\\train.json'
        train_sentences = self.load_sentences(path)
        train_data = self.prepare_dataset(train_sentences, self.max_seq_len, self.lower)
        train_manager = BatchManager(train_data, self.batch_size)
        init = tf.global_variables_initializer()
        steps_per_epoch = train_manager.len_data
        with tf.Session() as sess:
            loss = []
            sess.run(init)
            for i in range(self.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = self.model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % self.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        print("iteration:{} step:{}/{}, "
                              "NER loss:{:>9.6f}".format(
                                  iteration, step % steps_per_epoch,
                                  steps_per_epoch, np.mean(loss)))
                        loss = []
                # save a checkpoint at the end of each epoch
                self.save_model(sess, self.model, self.ckpt_path, global_steps=step)

    def load_sentences(self, path):
        """
        Load sentences from a JSON-lines file. Each line holds a dict with a
        'text' string and a space-separated 'label' sequence of the same length.
        Returns a list of sentences, each a list of [char, tag] pairs.
        """
        sentences = []
        for line in codecs.open(path, 'r', 'utf8'):
            data = json.loads(line)
            label_list = str(data['label']).split(' ')
            sentence = []
            for i, char in enumerate(list(data['text'])):
                sentence.append([char, label_list[i]])
            sentences.append(sentence)
        return sentences

    def save_model(self, sess, model, path, global_steps):
        checkpoint_path = os.path.join(path, "ner.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=global_steps)

    def prepare_dataset(self, sentences, max_seq_length, lower=False, train=True):
        """
        Prepare the dataset.
        Returns one item per sentence:
        [chars, segment_ids, input_ids, input_mask, label_ids].
        """
        data = []
        for s in sentences:
            if lower:
                string = [w[0].strip().lower() for w in s]
            else:
                string = [w[0].strip() for w in s]
            char_line = ' '.join(string)
            text = tokenization.convert_to_unicode(char_line)
            if train:
                tags = [w[-1] for w in s]
            else:
                tags = ['O' for _ in string]
            labels = ' '.join(tags)
            labels = tokenization.convert_to_unicode(labels)
            ids, mask, segment_ids, label_ids = self.convert_single_example(
                char_line=text,
                max_seq_length=max_seq_length,
                tokenizer=self.tokenizer,
                label_line=labels)
            data.append([string, segment_ids, ids, mask, label_ids])
        return data

    def convert_single_example(self, char_line, max_seq_length, tokenizer, label_line):
        """
        Convert a single example: tokenize the characters, map them to
        vocabulary ids, and map the tags to label ids.
        """
        text_list = char_line.split(' ')
        label_list = label_line.split(' ')
        tokens = []
        labels = []
        for i, word in enumerate(text_list):
            token = tokenizer.tokenize(word)
            tokens.extend(token)
            label_1 = label_list[i]
            for m in range(len(token)):
                if m == 0:
                    labels.append(label_1)
                else:
                    # sub-tokens after the first piece get the placeholder label "X"
                    labels.append("X")
        # truncate long sequences, leaving room for [CLS] and [SEP]
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")
        segment_ids.append(0)
        # "[CLS]" is used as its own label here; "O" would be another option
        label_ids.append(convert_samples.tag_to_id["[CLS]"])
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
            label_ids.append(convert_samples.tag_to_id[labels[i]])
        ntokens.append("[SEP]")
        segment_ids.append(0)
        # likewise, "[SEP]" is used as the label of the separator token
        label_ids.append(convert_samples.tag_to_id["[SEP]"])
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)
        # pad to max_seq_length; the label id 0 on padding positions is not meaningful
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)
            ntokens.append("**NULL**")
        return input_ids, input_mask, segment_ids, label_ids
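With the class above, training can be started from a small entry point such as the following sketch; it assumes the ALBERT checkpoint/config/vocab paths hard-coded in modelTrain.__init__ and the ..\data\train.json file actually exist on the machine running it.

# Minimal training entry point (sketch); the paths inside modelTrain must exist.
if __name__ == '__main__':
    trainer = modelTrain()
    trainer.train()  # checkpoints are written as ner.ckpt-* under trainer.ckpt_path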
class bert_predict:
    def __init__(self, **kwargs):
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=kwargs['vocab_file'],
            # vocab_file='D:\\mygit\\NER_MODEL\\albert_tiny_489k\\vocab.txt',
            do_lower_case=True)
        self.max_seq_len = 70
        self.ckpt_path = kwargs['model_dir']
        # self.ckpt_path = 'D:\\mygit\\NER_MODEL\\models'
        self.init_checkpoint = kwargs['init_checkpoint_file']
        # self.init_checkpoint = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_model.ckpt'
        self.bert_config = kwargs['bert_config_dir']
        # self.bert_config = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_config_tiny.json'
        self.graph = kwargs["graph"]
        with self.graph.as_default():
            self.model = Model(init_checkpoint_file=self.init_checkpoint,
                               bert_config_dir=self.bert_config)
            self.saver = tf.train.Saver()
            gpu_options = tf.GPUOptions(allow_growth=True)  # assumed default; tune GPU options as needed
            config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
            self.session = tf.Session(graph=self.graph, config=config)
            self.load()

    def load(self):
        ckpt = tf.train.get_checkpoint_state(self.ckpt_path)
        if ckpt is not None and ckpt.model_checkpoint_path:
            self.saver.restore(self.session, ckpt.model_checkpoint_path)
        else:
            raise Exception("failed to load model checkpoint from %s" % self.ckpt_path)

    def predict_batch(self, input_text):
        train_sentences = self.load_samples(input_text)
        train_data = convert_samples.prepare_dataset(train_sentences, self.max_seq_len, True)
        train_manager = BatchManager(train_data, len(input_text))
        temp = []
        results = self.model.evaluate(self.session, train_manager, convert_samples.id_to_tag)
        for i, v in enumerate(results):
            # v[1:-1] drops the first and last positions ([CLS]/[SEP])
            temp.append(bio_to_json(input_text[i], v[1:-1]))
        s = []
        for j in temp:
            if len(j['entities']) > 0:
                # one dict per sentence: entity type -> [entity word]
                data_items = {}
                for entity in j['entities']:
                    data_items[entity['type']] = [entity['word']]
                s.append(data_items)
        return s

    def load_samples(self, datas):
        """
        Wrap raw input strings in the sentence format expected by
        prepare_dataset: a list of [char, 'O'] pairs per sentence.
        """
        sentences = []
        for data in datas:
            label_list = list(len(data) * 'O')
            sentence = []
            for i, value in enumerate(data):
                sentence.append([value, label_list[i]])
            sentences.append(sentence)
        return sentences
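A usage sketch for the predictor; the paths below are placeholders, not the original ones, and would need to point at a real ALBERT vocab/config/checkpoint plus the directory holding the fine-tuned ner.ckpt files.

# Usage sketch (all paths are placeholders):
graph = tf.Graph()
predictor = bert_predict(
    vocab_file='albert_tiny_489k/vocab.txt',
    model_dir='models',
    init_checkpoint_file='albert_tiny_489k/albert_model.ckpt',
    bert_config_dir='albert_tiny_489k/albert_config_tiny.json',
    graph=graph)
# returns one {entity_type: [entity_word]} dict per input that produced entities
print(predictor.predict_batch(['我明天上午要去北京开会']))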