# NOTE(review): this top-level function is byte-for-byte identical to
# NERModel.__init_model defined below inside the class.  At module level the
# double-underscore name is NOT name-mangled and `self` is never supplied by
# any visible caller — this looks like a stray copy-paste duplicate that
# should be deleted once confirmed unused.
def __init_model(self, entry):
    """Build datasets and the BiLSTM-CRF model for the given entry mode.

    entry: 'train' constructs train/dev datasets and a trainable model;
           'predict' restores vocab/tag maps from disk and builds an
           inference-only model.
    """
    if entry == 'train':
        self.train_manager = NERDataset(model_path=self.model_path, data_path='data/ner_train.txt', data_type='train', tags=self.tags, max_len=self.embedding_size, batch_size=self.batch_size)
        self.train_manager.dump_data_map()
        # Ceiling division: number of batches per epoch.
        self.total_size = (len(self.train_manager) + self.batch_size - 1) // self.batch_size
        dev_manager = NERDataset(model_path=self.model_path, data_path='data/ner_test.txt', data_type='dev', tags=self.tags, max_len=self.embedding_size, batch_size=self.batch_size)
        self.dev_batch = dev_manager.batch_iter()
        self.model = BiLSTMCRF(
            self.device,
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(self.train_manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()
    elif entry == 'predict':
        data_map = self.load_params()
        self.tag_map = data_map.get('tag_map')
        self.vocab = data_map.get('vocab')
        # NOTE(review): len(self.vocab.items()) is equivalent to the simpler
        # len(self.vocab) when vocab is a dict.
        self.model = BiLSTMCRF(self.device, tag_map=self.tag_map, vocab_size=len(self.vocab.items()), embedding_dim=self.embedding_size, hidden_dim=self.hidden_size)
        self.restore_model()
    # Runs for any entry value; raises AttributeError if entry was neither
    # 'train' nor 'predict' because self.model was never assigned.
    self.model.to(self.device)
class NERModel(object):
    """BiLSTM-CRF named-entity-recognition model wrapper.

    Loads hyper-parameters from ``config/ner_config.yml``, builds the
    datasets and the ``BiLSTMCRF`` network for either training or
    prediction, and exposes ``train`` / ``evaluate`` / ``predict``.
    """

    def __init__(self, device, entry='train'):
        """Set up the model on *device* for mode *entry* ('train'/'predict')."""
        self.device = device
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """Build datasets and the BiLSTM-CRF model for the given entry mode.

        entry: 'train' constructs train/dev datasets and a trainable model;
               'predict' restores vocab/tag maps from disk instead.
        """
        if entry == 'train':
            self.train_manager = NERDataset(model_path=self.model_path,
                                            data_path='data/ner_train.txt',
                                            data_type='train',
                                            tags=self.tags,
                                            max_len=self.embedding_size,
                                            batch_size=self.batch_size)
            self.train_manager.dump_data_map()
            # Ceiling division: number of batches per epoch.
            self.total_size = ((len(self.train_manager) + self.batch_size - 1)
                               // self.batch_size)
            dev_manager = NERDataset(model_path=self.model_path,
                                     data_path='data/ner_test.txt',
                                     data_type='dev',
                                     tags=self.tags,
                                     max_len=self.embedding_size,
                                     batch_size=self.batch_size)
            self.dev_batch = dev_manager.batch_iter()
            self.model = BiLSTMCRF(
                self.device,
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == 'predict':
            data_map = self.load_params()
            self.tag_map = data_map.get('tag_map')
            self.vocab = data_map.get('vocab')
            # len(dict) equals len(dict.items()); the simpler form is used.
            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)

    def load_config(self):
        """Load hyper-parameters from config/ner_config.yml.

        Falls back to writing (and using) a default configuration when the
        file is missing or unreadable.
        """
        try:
            # safe_load: the config holds plain scalars/lists only, and
            # yaml.load without an explicit Loader is a TypeError on
            # PyYAML >= 6 (and unsafe on older versions).
            with open('config/ner_config.yml') as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            logger.warning(f'Load config failed, using default config {error}')
            with open('config/ner_config.yml', 'w') as fopen:
                config = {
                    'embedding_size': 200,
                    'hidden_size': 128,
                    'batch_size': 128,
                    'dropout': 0.5,
                    'model_path': 'model/',
                    'tags': ['ORG', 'PER', 'LOC', 'COM'],
                }
                yaml.dump(config, fopen)
        self.embedding_size = config.get('embedding_size')
        self.hidden_size = config.get('hidden_size')
        self.batch_size = config.get('batch_size')
        self.model_path = config.get('model_path')
        self.tags = config.get('tags')
        self.dropout = config.get('dropout')

    def restore_model(self):
        """Load saved weights into the model if a checkpoint exists.

        Best-effort: a missing/incompatible checkpoint is only logged so a
        fresh training run can start from random weights.
        """
        try:
            self.model.load_state_dict(
                torch.load(os.path.join(self.model_path, 'params.pkl')))
            logger.info('model restore success!')
        except Exception as error:
            # logger.warn is a deprecated alias; typo 'faild' fixed.
            logger.warning(f'model restore failed! {error}')

    def load_params(self):
        """Read the pickled vocab/tag-map dict produced at training time."""
        # Plain open in binary mode; codecs.open adds nothing for 'rb'.
        with open('ner_model/data.pkl', 'rb') as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Run the training loop and save the weights after each epoch."""
        optimizer = optim.Adam(self.model.parameters())
        epoch_num = 1
        for epoch in range(epoch_num):
            progress = tqdm(self.train_manager.batch_iter(),
                            desc=f'NER Epoch#{epoch + 1}/{epoch_num}',
                            total=self.total_size,
                            dynamic_ncols=True)
            for batch in progress:
                self.model.zero_grad()
                sentences, tags = zip(*batch)
                sentences_tensor = torch.tensor(
                    sentences, dtype=torch.long).to(self.device)
                tags_tensor = torch.tensor(
                    tags, dtype=torch.long).to(self.device)
                trained_tags = self.model(sentences_tensor)
                # The CRF layer returns a log-likelihood; negate it to get
                # a loss to minimize (neg_log_likelihood).
                loss = -self.model.crf(trained_tags, tags_tensor)
                progress.set_postfix({
                    'loss': loss.item(),
                })
                loss.backward()
                optimizer.step()
            # os.path.join for consistency with restore_model's load path.
            torch.save(self.model.state_dict(),
                       os.path.join(self.model_path, 'params.pkl'))

    def evaluate(self):
        """Run the model on one dev batch (metric computation still TODO)."""
        sentences, labels = zip(*next(self.dev_batch))
        # NOTE(review): unlike train(), sentences are not converted to a
        # tensor here, and the single-return convention used elsewhere
        # suggests this unpacking may fail — confirm the model's forward
        # signature before enabling the f1 computation below.
        _, paths = self.model(sentences)
        for tag in self.tags:
            pass  # f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=''):
        """Predict entity tags for *input_str* (prompts on stdin when empty).

        Returns a list of tag names decoded by the CRF layer.
        """
        if not input_str:
            input_str = input('请输入文本: ')
        # Map each character to its vocab id; unknown characters map to 0.
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Shape (1, seq_len): a single-sentence batch.
        sentences = torch.tensor(input_vec).to(self.device).view(1, -1)
        # Tag names ordered by their numeric id, so id2tag[id] -> name.
        id2tag = [
            k for (k, v) in sorted(self.tag_map.items(), key=lambda x: x[1])
        ]
        trained_tags = self.model(sentences)
        entities = self.model.crf.decode(trained_tags)
        # NOTE(review): CRF decode usually yields one id-sequence per batch
        # item; x[0] keeps only the first id of each sequence — confirm this
        # is the intended mapping.
        tags = list(map(lambda x: id2tag[x[0]], entities))
        return tags