def predict(token_vocab, target_vocab, sent):
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU-only prediction
    model_dir = "./trained_models"

    # Convert the raw sentence to id data.
    pred_data = N2NTextData(sent, mode='sentence')
    pred_id_data = N2NConverter.convert(pred_data, target_vocab, token_vocab)
    pred_data_set = NERDataset(pred_id_data, 1, 128)

    a_batch_data = next(pred_data_set.predict_iterator)
    b_nes_id, b_token_ids, b_weight = a_batch_data

    # Restore the graph. frozen_graph.tf.pb contains the graph definition
    # with parameter values in binary format.
    _graph_fn = os.path.join(model_dir, 'frozen_graph.tf.pb')
    with tf.gfile.GFile(_graph_fn, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)

    with tf.Session(graph=graph) as sess:
        # To inspect the loaded graph:
        # for n in tf.get_default_graph().as_graph_def().node: print(n.name)

        # Input tensors
        pl_token = graph.get_tensor_by_name('import/model/pl_tokens:0')
        pl_weight = graph.get_tensor_by_name('import/model/pl_weight:0')
        pl_keep_prob = graph.get_tensor_by_name('import/model/pl_keep_prob:0')

        # Output tensors
        step_out_preds = graph.get_tensor_by_name('import/model/step_out_preds:0')
        step_out_probs = graph.get_tensor_by_name('import/model/step_out_probs:0')

        # Predict the sentence.
        b_best_step_pred_indexs, b_step_pred_probs = sess.run(
            [step_out_preds, step_out_probs],
            feed_dict={
                pl_token: b_token_ids,
                pl_weight: b_weight,
                pl_keep_prob: 1.0,
            })

        best_step_pred_indexs = b_best_step_pred_indexs[0]
        step_pred_probs = b_step_pred_probs[0]

        step_best_targets = []
        step_best_target_probs = []
        for time_step, best_pred_index in enumerate(best_step_pred_indexs):
            _target_class = target_vocab.get_symbol(best_pred_index)
            step_best_targets.append(_target_class)
            _prob = step_pred_probs[time_step][best_pred_index]
            step_best_target_probs.append(_prob)

        for idx, char in enumerate(list(sent)):
            print('{}\t{}\t{}'.format(char, step_best_targets[idx],
                                      step_best_target_probs[idx]))
def train(train_id_data, num_vocabs, num_target_class):
    # Train the NER model on the given train_id_data.
    max_epoch = 300
    model_dir = "./trained_models"

    hps = NER.get_default_hparams()
    hps.update(batch_size=100,
               num_steps=128,
               emb_size=50,
               enc_dim=100,
               vocab_size=num_vocabs,
               num_target_class=num_target_class)

    with tf.variable_scope("model"):
        model = NER(hps, "train")

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=model_dir,
                             summary_op=None,
                             global_step=model.global_step)

    # Allow TF to assign compatible operators to GPU or CPU automatically.
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(config=tf_config) as sess:
        local_step = 0
        prev_global_step = sess.run(model.global_step)

        train_data_set = NERDataset(train_id_data, hps.batch_size, hps.num_steps)
        losses = []
        while not sv.should_stop():
            fetches = [model.global_step, model.loss, model.train_op]
            a_batch_data = next(train_data_set.iterator)
            y, x, w = a_batch_data
            fetched = sess.run(
                fetches, {
                    model.x: x,
                    model.y: y,
                    model.w: w,
                    model.keep_prob: hps.keep_prob,
                })

            local_step += 1
            _global_step = fetched[0]
            _loss = fetched[1]
            losses.append(_loss)
            if local_step < 10 or local_step % 10 == 0:
                epoch = train_data_set.get_epoch_num()
                print("Epoch = {:3d} Step = {:7d} loss = {:5.3f}".format(
                    epoch, _global_step, np.mean(losses)))
                losses = []
                if epoch >= max_epoch:
                    break

    print("Training is done.")
    sv.stop()

    # Freeze the graph (model.out_pred, model.out_probs) with parameters
    # to protobuf format.
    freeze_graph(model_dir, "model/step_out_preds,model/step_out_probs",
                 "frozen_graph.tf.pb")
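# `freeze_graph` is called above but not defined in this snippet. Below is a
# minimal sketch of what it might look like under TF 1.x, assuming the latest
# checkpoint in `model_dir` holds the trained parameters; the actual helper
# used by this project may differ.
def freeze_graph(model_dir, output_node_names, frozen_graph_name):
    checkpoint = tf.train.latest_checkpoint(model_dir)
    with tf.Graph().as_default():
        saver = tf.train.import_meta_graph(checkpoint + '.meta',
                                           clear_devices=True)
        with tf.Session() as sess:
            saver.restore(sess, checkpoint)
            # Convert variables to constants so the graph is self-contained.
            frozen_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split(','))
    with tf.gfile.GFile(os.path.join(model_dir, frozen_graph_name), 'wb') as f:
        f.write(frozen_def.SerializeToString())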
if __name__ == '__main__':
    train_id_data, token_vocab, target_vocab = load_data()
    num_vocabs = token_vocab.get_num_tokens()
    num_target_class = target_vocab.get_num_targets()

    train_data_set = NERDataset(train_id_data, 5, 128)
    train(train_id_data, num_vocabs, num_target_class)
    predict(token_vocab, target_vocab,
            '의정지기단은 첫 사업으로 45 명 시의원들의 선거 공약을 수집해 개인별로 카드를 만들었다.')
    predict(token_vocab, target_vocab,
            '한국소비자보호원은 19일 시판중인 선물세트의 상당수가 과대 포장된 것으로 드러났다고 밝혔다.')
        return (Variable(
            weight.new(self.n_layers, batch_size, self.hidden_dim).uniform_()),
                Variable(
            weight.new(self.n_layers, batch_size, self.hidden_dim).uniform_()))


if __name__ == "__main__":
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BATCH_SIZE = 256

    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab

    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)

    sample_data, sample_target, sample_len = next(iter(train_loader))
    sample_data = sample_data.long()

    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    hidden = model.init_hidden(BATCH_SIZE)
    with torch.no_grad():
        tag_scores = model(sample_data, hidden)
    print(tag_scores.shape)
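# `custom_collate` is referenced above but not defined in this snippet. A
# minimal sketch, assuming each dataset item is a (token_ids, label_ids) pair
# of 1-D tensors; the project's actual collate function may differ.
from torch.nn.utils.rnn import pad_sequence

def custom_collate(batch):
    seqs, targets = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    # Pad variable-length sequences to the longest one in the batch.
    padded_seqs = pad_sequence(seqs, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return padded_seqs, padded_targets, lengths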
class NERModel(object):
    def __init__(self, device, entry='train'):
        self.device = device
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == 'train':
            self.train_manager = NERDataset(model_path=self.model_path,
                                            data_path='data/ner_train.txt',
                                            data_type='train',
                                            tags=self.tags,
                                            max_len=self.embedding_size,
                                            batch_size=self.batch_size)
            self.train_manager.dump_data_map()
            self.total_size = (len(self.train_manager) + self.batch_size -
                               1) // self.batch_size

            dev_manager = NERDataset(model_path=self.model_path,
                                     data_path='data/ner_test.txt',
                                     data_type='dev',
                                     tags=self.tags,
                                     max_len=self.embedding_size,
                                     batch_size=self.batch_size)
            self.dev_batch = dev_manager.batch_iter()

            self.model = BiLSTMCRF(
                self.device,
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == 'predict':
            data_map = self.load_params()
            self.tag_map = data_map.get('tag_map')
            self.vocab = data_map.get('vocab')

            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)

    def load_config(self):
        try:
            with open('config/ner_config.yml') as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            logger.warning(f'Load config failed, using default config: {error}')
            with open('config/ner_config.yml', 'w') as fopen:
                config = {
                    'embedding_size': 200,
                    'hidden_size': 128,
                    'batch_size': 128,
                    'dropout': 0.5,
                    'model_path': 'model/',
                    'tags': ['ORG', 'PER', 'LOC', 'COM']
                }
                yaml.dump(config, fopen)
        self.embedding_size = config.get('embedding_size')
        self.hidden_size = config.get('hidden_size')
        self.batch_size = config.get('batch_size')
        self.model_path = config.get('model_path')
        self.tags = config.get('tags')
        self.dropout = config.get('dropout')

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(os.path.join(self.model_path, 'params.pkl')))
            logger.info('model restore success!')
        except Exception as error:
            logger.warning(f'model restore failed!\n{error}')
    def load_params(self):
        with codecs.open('ner_model/data.pkl', 'rb') as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(self.model.parameters(), lr=0.01)
        epoch_num = 1
        for epoch in range(epoch_num):
            progress = tqdm(self.train_manager.batch_iter(),
                            desc=f'NER Epoch#{epoch + 1}/{epoch_num}',
                            total=self.total_size,
                            dynamic_ncols=True)
            for batch in progress:
                self.model.zero_grad()

                sentences, tags = zip(*batch)
                sentences_tensor = torch.tensor(
                    sentences, dtype=torch.long).to(self.device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.long).to(self.device)

                trained_tags = self.model(sentences_tensor)
                # Negative log-likelihood of the CRF layer.
                loss = -self.model.crf(trained_tags, tags_tensor)
                progress.set_postfix({
                    'loss': loss.item(),
                })

                loss.backward()
                optimizer.step()
            torch.save(self.model.state_dict(),
                       os.path.join(self.model_path, 'params.pkl'))

    def evaluate(self):
        sentences, labels = zip(*next(self.dev_batch))
        _, paths = self.model(sentences)
        for tag in self.tags:
            pass  # f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=''):
        if not input_str:
            input_str = input('Enter text: ')
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Convert to a tensor of shape (1, seq_len).
        sentences = torch.tensor(input_vec).to(self.device).view(1, -1)
        # _, paths = self.model(sentences)

        id2tag = [
            k for (k, v) in sorted(self.tag_map.items(), key=lambda x: x[1])
        ]
        results = {}
        for tag in id2tag:
            results.update({tag.split('-')[-1]: []})

        trained_tags = self.model(sentences)
        entities = self.model.crf.decode(trained_tags)
        tags = list(map(lambda x: id2tag[x[0]], entities))
        return tags
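# A minimal usage sketch (assumed, not part of the original module): train a
# model, then reload its saved parameters for prediction.
if __name__ == '__main__':
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ner = NERModel(device, entry='train')
    ner.train()

    predictor = NERModel(device, entry='predict')
    print(predictor.predict('some input text'))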
        tags_pred = model.decode(seqs.to(device), masks.to(device))
        for tp in tags_pred:
            y_pred.append([ix_to_tag[ix] for ix in tp])

        # True tags: recover each sequence up to its unpadded length.
        lens = masks.sum(0).tolist()
        tags_l = tags.t().tolist()
        for t, ln in zip(tags_l, lens):
            y_true.append([ix_to_tag[ix] for ix in t[:ln]])

    return score(y_true, y_pred)


if __name__ == "__main__":
    data_dir = f"data/{args.dataset}/processed"

    # Load datasets
    train_data = NERDataset(os.path.join(data_dir, "train.pkl"))
    test_data = NERDataset(os.path.join(data_dir, "test.pkl"))
    dev_data = NERDataset(os.path.join(data_dir, "dev.pkl"))

    # Load vocabs
    word_to_ix = load_obj(os.path.join(data_dir, "word_to_ix.pkl"))
    tag_to_ix = load_obj(os.path.join(data_dir, "tag_to_ix.pkl"))
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    # DataLoaders
    train_loader = DataLoader(
        train_data,
        batch_size=args.batch_size,
        collate_fn=BatchPadding(),
        shuffle=True,
    )
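# `score` is not shown in this snippet. A minimal sketch, assuming entity-level
# evaluation with the seqeval package; the project's actual metric may differ.
from seqeval.metrics import classification_report, f1_score

def score(y_true, y_pred):
    # Print per-entity precision/recall/F1 and return the overall F1.
    print(classification_report(y_true, y_pred))
    return f1_score(y_true, y_pred)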
if __name__ == '__main__':
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 64
    BATCH_SIZE = 64
    EPOCH = 20
    LR_RATE = 1e-4

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()
    writer.flush()

    # Create train and validation dataloaders
    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab

    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)

    val_dataset = NERDataset('data', vocab, type='/val')
    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=2,
                            collate_fn=custom_collate,
                            shuffle=True)

    # Model initialisation
    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    model.to(device)
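    # The snippet ends after model setup. A minimal training-loop sketch under
    # the assumptions that the RNN returns per-token scores of shape
    # (batch, seq_len, num_tags), padding index 0 is ignored in the loss, and
    # `custom_collate` yields (data, target, lengths); the original loop may
    # differ.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR_RATE)

    for epoch in range(EPOCH):
        model.train()
        total_loss = 0.0
        for data, target, lengths in train_loader:
            data = data.long().to(device)
            target = target.long().to(device)

            optimizer.zero_grad()
            hidden = model.init_hidden(data.size(0))
            tag_scores = model(data, hidden)
            # Flatten (batch, seq_len, num_tags) and (batch, seq_len) for the loss.
            loss = criterion(tag_scores.view(-1, tag_scores.size(-1)),
                             target.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        writer.add_scalar('Loss/train', total_loss / len(train_loader), epoch)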