# Example #1
    def __init_model(self, entry):
        """Set up the BiLSTM-CRF model for the requested mode.

        ``entry == 'train'`` builds the train/dev datasets and a trainable
        model; ``entry == 'predict'`` restores the saved vocab/tag maps and
        builds an inference-only model. In both cases saved weights are
        restored (best effort) and the model is moved to ``self.device``.
        """
        if entry == 'train':
            train_manager = NERDataset(model_path=self.model_path,
                                       data_path='data/ner_train.txt',
                                       data_type='train',
                                       tags=self.tags,
                                       max_len=self.embedding_size,
                                       batch_size=self.batch_size)
            self.train_manager = train_manager
            train_manager.dump_data_map()
            # Batches per epoch (ceil division without importing math).
            self.total_size = -(-len(train_manager) // self.batch_size)

            eval_manager = NERDataset(model_path=self.model_path,
                                      data_path='data/ner_test.txt',
                                      data_type='dev',
                                      tags=self.tags,
                                      max_len=self.embedding_size,
                                      batch_size=self.batch_size)
            self.dev_batch = eval_manager.batch_iter()

            self.model = BiLSTMCRF(self.device,
                                   tag_map=train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        elif entry == 'predict':
            params = self.load_params()
            self.tag_map = params.get('tag_map')
            self.vocab = params.get('vocab')
            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab.items()),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)
# Example #2
class NERModel(object):
    """BiLSTM-CRF based named-entity-recognition model wrapper.

    Modes (selected by ``entry``):
      * ``'train'``   -- build train/dev datasets and a trainable model.
      * ``'predict'`` -- restore saved vocab/tag maps for inference.
    """

    def __init__(self, device, entry='train'):
        self.device = device
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """Create the BiLSTM-CRF model (plus datasets when training)."""
        if entry == 'train':
            self.train_manager = NERDataset(model_path=self.model_path,
                                            data_path='data/ner_train.txt',
                                            data_type='train',
                                            tags=self.tags,
                                            max_len=self.embedding_size,
                                            batch_size=self.batch_size)
            self.train_manager.dump_data_map()
            # Ceil division: number of batches per epoch.
            self.total_size = (len(self.train_manager) + self.batch_size -
                               1) // self.batch_size
            dev_manager = NERDataset(model_path=self.model_path,
                                     data_path='data/ner_test.txt',
                                     data_type='dev',
                                     tags=self.tags,
                                     max_len=self.embedding_size,
                                     batch_size=self.batch_size)
            self.dev_batch = dev_manager.batch_iter()

            self.model = BiLSTMCRF(
                self.device,
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == 'predict':
            data_map = self.load_params()
            self.tag_map = data_map.get('tag_map')
            self.vocab = data_map.get('vocab')
            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)

    def load_config(self):
        """Load hyper-parameters from ``config/ner_config.yml``.

        If the file is missing or unreadable, a default config is written
        back to disk and used for this run.
        """
        try:
            # safe_load: bare yaml.load without a Loader is deprecated
            # (PyYAML >= 5.1) and unsafe on untrusted input.
            with open('config/ner_config.yml') as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            logger.warning(f'Load config failed, using default config {error}')
            config = {
                'embedding_size': 200,
                'hidden_size': 128,
                'batch_size': 128,
                'dropout': 0.5,
                'model_path': 'model/',
                'tags': ['ORG', 'PER', 'LOC', 'COM']
            }
            with open('config/ner_config.yml', 'w') as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get('embedding_size')
        self.hidden_size = config.get('hidden_size')
        self.batch_size = config.get('batch_size')
        self.model_path = config.get('model_path')
        self.tags = config.get('tags')
        self.dropout = config.get('dropout')

    def restore_model(self):
        """Best-effort restore of weights from ``model_path/params.pkl``."""
        try:
            self.model.load_state_dict(
                torch.load(os.path.join(self.model_path, 'params.pkl')))
            logger.info('model restore success!')
        except Exception as error:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(f'model restore failed! {error}')

    def load_params(self):
        """Return the pickled vocab/tag-map saved at training time.

        NOTE(security): pickle.load executes arbitrary code; only load
        files produced by this project.
        """
        with open('ner_model/data.pkl', 'rb') as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Run the training loop, checkpointing after each epoch."""
        optimizer = optim.Adam(self.model.parameters())
        epoch_num = 1
        for epoch in range(epoch_num):
            progress = tqdm(self.train_manager.batch_iter(),
                            desc=f'NER Epoch#{epoch + 1}/{epoch_num}',
                            total=self.total_size,
                            dynamic_ncols=True)
            for batch in progress:
                self.model.zero_grad()
                sentences, tags = zip(*batch)
                sentences_tensor = torch.tensor(
                    sentences, dtype=torch.long).to(self.device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.long).to(self.device)
                trained_tags = self.model(sentences_tensor)
                # CRF log-likelihood; negate to get a minimizable loss.
                loss = -self.model.crf(trained_tags, tags_tensor)
                progress.set_postfix({
                    'loss': loss.item(),
                })
                loss.backward()
                optimizer.step()
            # Same path restore_model() reads (was built with '+' before,
            # which only worked because model_path ends with a slash).
            torch.save(self.model.state_dict(),
                       os.path.join(self.model_path, 'params.pkl'))

    def evaluate(self):
        """Run the model on one dev batch (metric computation is TODO)."""
        sentences, labels = zip(*next(self.dev_batch))
        _, paths = self.model(sentences)
        for tag in self.tags:
            pass
            # TODO: f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=''):
        """Tag a single input string and return the predicted tag names."""
        if not input_str:
            input_str = input('请输入文本: ')
        # Unknown characters map to vocab index 0.
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Shape (1, seq_len): a single-sentence batch.
        sentences = torch.tensor(input_vec).to(self.device).view(1, -1)
        # Tag names ordered by their ids, so id2tag[id] -> name.
        id2tag = [
            k for (k, v) in sorted(self.tag_map.items(), key=lambda x: x[1])
        ]
        trained_tags = self.model(sentences)
        entities = self.model.crf.decode(trained_tags)
        # NOTE(review): x[0] takes only the first tag id of each decoded
        # sequence -- confirm against the CRF decode() output shape.
        tags = list(map(lambda x: id2tag[x[0]], entities))
        return tags