import json

import torch
from torch.nn import Softmax
from torch.utils.data import dataloader
from transformers import BertTokenizer

# BertClassifier and SentimentDataset are project-local modules, assumed importable.


class Predictor(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        print(f"Tokenizer from: {pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model = BertClassifier(model_conf)
        self.model.load_state_dict(
            torch.load(train_conf.model_path, map_location=torch.device(self.device)))
        self.model.to(self.device)
        self.model.eval()  # disable dropout for inference
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = {i: label_str for label_str, i in self.label_map.items()}
        self.softmax = Softmax(dim=1)

    def predict(self, sens):
        d_loader = self.sen_2_dl(sens)
        y_pred = list()
        with torch.no_grad():
            for batch in d_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                logits = self.model(input_ids, attention_mask)
                logits = torch.sigmoid(logits)
                y_pred.append(logits)
        y_pred = torch.cat(y_pred)
        y_pred = y_pred.cpu().numpy()
        res = list()
        for y in y_pred:
            res.append(self._score_2_dict(y))
        return res

    def _score_2_dict(self, single_pred):
        res = dict()
        for i, score in enumerate(single_pred):
            label_str = self.id2label[i]
            res[label_str] = float(score)
        return res

    def sen_2_dl(self, sens):
        texts = [i.strip() for i in sens]
        # Placeholder label: the dataset class requires labels even though they
        # are never used at prediction time.
        labels = [999]
        ds = SentimentDataset(self.tokenizer, texts, labels, self.max_len)
        _loader = dataloader.DataLoader(
            ds, batch_size=self.conf.train_info.batch_size, shuffle=False)
        return _loader
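# Usage sketch (hypothetical): `args` must expose the attributes read in
# __init__ above (model_info, train_info, label_map_path); `load_config` stands
# in for however the project actually builds that object.
if __name__ == '__main__':
    args = load_config('conf/predict.yaml')  # placeholder config loader
    predictor = Predictor(args)
    sentences = ["The movie was surprisingly good.", "Terrible service, never again."]
    for sentence, scores in zip(sentences, predictor.predict(sentences)):
        # `scores` is the {label: score} dict built by _score_2_dict.
        print(sentence, max(scores, key=scores.get))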
# In addition to the imports above, this class relies on os, numpy (as np),
# GPT2Tokenizer from transformers, and the project-local GPT2Classifier,
# GPT2Dataset, metrics, and prepare helpers.


class Evaluator(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        print(f"Tokenizer from: {pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        if self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        self.device = train_conf.device
        self.model.load_state_dict(
            torch.load(train_conf.model_path, map_location=torch.device(self.device)))
        self.class_num = model_conf.class_num
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = {i: label_str for label_str, i in self.label_map.items()}

    def run(self, batch_size=64):
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)
        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)
        # Write misclassified examples to a logs/ directory alongside the data.
        dir_ = os.path.dirname(test_path)
        dir_ = os.path.dirname(dir_)
        dir_ = os.path.split(dir_)[0]
        new_path = os.path.join(dir_, 'logs', 'bad_case.json')
        with open(new_path, 'w') as f:
            for i in res:
                print(json.dumps(i, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        self.model.eval()
        y_true = list()
        y_pred = list()
        res = []
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = batch['labels']
                y = torch.squeeze(y, 1)
                y = y.to(self.device)
                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)
                pred_labels = torch.argmax(logits, dim=1)
                preds = pred_labels.cpu().numpy()
                true = batch['labels'].squeeze().numpy()
                if len(true) < 1:
                    continue
                # Collect misclassified samples for error analysis.
                for i, c_y in enumerate(true):
                    if c_y != preds[i]:
                        tmp_dict = {
                            'true_label': self.id2label[c_y],
                            'pred_label': self.id2label[preds[i]],
                            'text': batch['text'][i]
                        }
                        res.append(tmp_dict)
        y_true = torch.cat(y_true)
        y_pred = torch.cat(y_pred)
        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)
        label_range = list(range(len(self.label_map)))
        target_name = [
            x[0] for x in sorted(self.label_map.items(), key=lambda x: x[1])
        ]
        report = metrics.get_classification_report(y_true, y_pred, label_range, target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
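# Usage sketch (hypothetical): running a full evaluation pass. As above,
# `load_config` is a placeholder for the project's real config loading.
if __name__ == '__main__':
    args = load_config('conf/eval.yaml')  # placeholder config loader
    evaluator = Evaluator(args)
    evaluator.run(batch_size=64)  # prints metrics and writes logs/bad_case.json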
# (Tail of the evaluation helper: accumulate batch statistics inside the loop,
#  then report and return the averages.)
        total_count += gt.shape[0]
        total_loss.append(criterion(preds, labels).item())
    loss, acc = np.array(total_loss).mean(), total_correct / total_count
    print("Average Loss: {:.6f}, Accuracy: {:.6f}".format(loss, acc))
    return loss, acc


device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 30
best_acc = 0.0
eval_losses, eval_accs = [], []
train_losses, train_accs = [], []

model = BertClassifier(freeze_bert=False)
model = model.to(device)
# model = nn.DataParallel(model)

train_dataset = EmojiDataset('../../data/train_bert_sentences.npy',
                             '../../data/train_bert_labels.npy')
# Note: training data is usually shuffled; shuffle=False is kept from the original.
train_dataloader = DataLoader(train_dataset,
                              batch_size=64,
                              shuffle=False,
                              collate_fn=collate_fn)
test_dataset = EmojiDataset('../../data/test_bert_sentences.npy',
                            '../../data/test_bert_labels.npy')
test_dataloader = DataLoader(test_dataset,
                             batch_size=128,
                             shuffle=False,
                             collate_fn=collate_fn)
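# Sketch of the loop these objects feed into (assumed, not part of the original
# snippet): `train_epoch` and the full signature of the evaluation helper above
# are placeholders, as are the optimizer and criterion choices.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, device)
    eval_loss, eval_acc = evaluate(model, test_dataloader, criterion, device)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    eval_losses.append(eval_loss)
    eval_accs.append(eval_acc)
    if eval_acc > best_acc:
        best_acc = eval_acc
        torch.save(model.state_dict(), 'best_model.pt')  # keep the best checkpoint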
import pandas as pd
import torch
from transformers import BertConfig, BertTokenizer

# BertClassifier and get_bert_input are project-local helpers assumed importable.

# Category names of the Chinese news classifier (sports, entertainment, home,
# real estate, education, fashion, politics, games, technology, finance).
labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

bert_config = BertConfig.from_pretrained('chinese_wwm_pytorch')
bert_config.num_labels = len(labels)
model = BertClassifier(bert_config)
model.load_state_dict(
    torch.load('./best_model_on_trainset.pkl', map_location=torch.device('cpu')))
tokenizer = BertTokenizer(vocab_file='chinese_wwm_pytorch/vocab.txt')

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
model = torch.nn.DataParallel(model, device_ids=[2])
model.to(device)
model.eval()  # disable dropout for inference


def predict_text(text):
    input_id, attention_mask, token_type_id = get_bert_input(text, tokenizer)
    input_id = torch.tensor([input_id], dtype=torch.long)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
    token_type_id = torch.tensor([token_type_id], dtype=torch.long)
    with torch.no_grad():
        predicted = model(
            input_id,
            attention_mask,
            token_type_id,
        )
    pred_label = torch.argmax(predicted, dim=1)
    return labels[pred_label.item()]  # map the predicted index back to its category name
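# Example call (hypothetical input sentence, roughly "the central bank announced
# a cut to the reserve requirement ratio"); with the return added above this
# should print one of the category names, likely '财经' (finance).
if __name__ == '__main__':
    print(predict_text('央行宣布下调存款准备金率'))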
# (Tail of the comment_prediction helper: `x` holds the encoded input ids, and
#  zero ids mark padding, so the attention mask is derived from them.)
    attention_mask = (x != 0).float().to(config.DEVICE).long()
    outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()


@app.route('/predict')
def predict():
    comment = request.args.get('comment')
    start_time = time.time()
    prediction = comment_prediction(comment)
    # One probability per class, keyed by the class column names.
    response = {
        'response': {
            label: str(prob)
            for label, prob in zip(config.CLASS_COLS, prediction[0])
        }
    }
    response['response']['comment'] = comment
    response['response']['time_taken'] = str(time.time() - start_time)
    return flask.jsonify(response)


if __name__ == '__main__':
    bert_config = BertConfig.from_pretrained(config.BERT_NAME)
    bert_config.num_labels = config.NUM_CLASSES
    MODEL = BertClassifier(bert_config)
    MODEL.load_state_dict(torch.load(config.TRAINED_MODEL_PATH))
    MODEL.to(config.DEVICE)
    MODEL.eval()
    app.run(host=config.HOST, port=config.PORT)
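# Example client call (host/port and the sample comment are placeholders; the
# route above reads the text from the `comment` query parameter).
import requests

resp = requests.get('http://localhost:5000/predict',
                    params={'comment': 'This movie was great!'})
print(resp.json())  # {'response': {<label>: <prob>, ..., 'comment': ..., 'time_taken': ...}}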