class Predictor(object):
    """Score raw sentences with a trained ``BertClassifier``.

    The constructor loads the tokenizer, the model weights and the label map;
    ``predict`` then returns, for every input sentence, a mapping from label
    name to the sigmoid of that label's logit.
    """

    def __init__(self, args):
        """Build the tokenizer and model described by *args*.

        Args:
            args: configuration object. This class reads
                ``args.model_info`` (``bert_path``, ``class_num``),
                ``args.train_info`` (``device``, ``model_path``, ``lr``,
                ``max_seq_len``, ``batch_size``) and ``args.label_map_path``
                (a JSON file mapping label name -> integer id).
        """
        model_conf = args.model_info
        train_conf = args.train_info

        # Fall back to the stock checkpoint name when no path is configured.
        pretrain_name = model_conf.bert_path or 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        print(f"Tokenizer from:{pretrain_name}")

        self.device = train_conf.device
        self.class_num = model_conf.class_num

        self.model = BertClassifier(model_conf)
        # map_location lets a checkpoint saved on another device be loaded
        # onto the configured one.
        state_dict = torch.load(train_conf.model_path,
                                map_location=torch.device(self.device))
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)

        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args

        # Context manager so the file handle is released deterministically.
        with open(args.label_map_path, encoding='utf-8') as fp:
            self.label_map = json.load(fp)
        self.id2label = {
            idx: label_str for label_str, idx in self.label_map.items()
        }
        self.softmax = Softmax(dim=1)

    def predict(self, sens):
        """Return per-label scores for every sentence in *sens*.

        Args:
            sens: iterable of strings.

        Returns:
            A list with one dict per sentence, mapping label name to the
            sigmoid of the corresponding logit (as a Python float). An empty
            input yields an empty list.
        """
        sens = list(sens)
        if not sens:
            # Nothing to score; torch.cat on an empty list would raise.
            return []

        # Inference mode: without this, layers such as dropout keep their
        # training behaviour and the scores are not reproducible.
        self.model.eval()

        d_loader = self.sen_2_dl(sens)
        batch_scores = []
        with torch.no_grad():
            for batch in d_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                logits = self.model(input_ids, attention_mask)
                batch_scores.append(torch.sigmoid(logits))

        scores = torch.cat(batch_scores).cpu().numpy()
        return [self._score_2_dict(row) for row in scores]

    def _score_2_dict(self, single_pred):
        """Map one score vector to ``{label name: float score}``."""
        return {
            self.id2label[idx]: float(score)
            for idx, score in enumerate(single_pred)
        }

    def sen_2_dl(self, sens):
        """Wrap *sens* in a non-shuffling ``DataLoader``.

        Args:
            sens: iterable of strings; each one is stripped of surrounding
                whitespace before tokenisation.

        Returns:
            A ``DataLoader`` over a ``SentimentDataset`` using the configured
            training batch size.
        """
        texts = [sentence.strip() for sentence in sens]
        # The labels are never read by predict(), but the dataset constructor
        # requires them. One placeholder per text keeps the two lists aligned
        # however the dataset indexes them (NOTE(review): SentimentDataset is
        # defined elsewhere - confirm it has no other expectation on labels).
        labels = [999] * len(texts)
        ds = SentimentDataset(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(
            ds, batch_size=self.conf.train_info.batch_size, shuffle=False)
class Evaluator(object):
    """Evaluate a trained classifier on the configured test set.

    Supports the ``'bert_seq'`` and ``'GPT2'`` model types. ``run`` prints the
    metrics and writes every misclassified sample to ``logs/bad_case.json``.
    """

    def __init__(self, args):
        """Build tokenizer, dataset class and model from *args*.

        Args:
            args: configuration object. This class reads
                ``args.model_info`` (``bert_path``, ``model``, ``class_num``),
                ``args.train_info`` (``model_path``, ``device``, ``lr``,
                ``max_seq_len``, ``test_path``), ``args.label_map_path`` and
                ``args.num_workers``.

        Raises:
            ValueError: if ``args.model_info.model`` is neither
                ``'bert_seq'`` nor ``'GPT2'``.
        """
        model_conf = args.model_info
        train_conf = args.train_info

        # Fall back to the stock checkpoint name when no path is configured.
        pretrain_name = model_conf.bert_path or 'bert-base-cased'
        print(f"Tokenizer from:{pretrain_name}")

        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        elif self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        else:
            # Fail with a clear message instead of an AttributeError on
            # self.model further down.
            raise ValueError(
                f"Unsupported model type: {self.model_type!r} "
                "(expected 'bert_seq' or 'GPT2')")

        self.device = train_conf.device
        self.class_num = model_conf.class_num

        # map_location lets a checkpoint saved on another device be loaded
        # onto the configured one (same approach as Predictor).
        state_dict = torch.load(train_conf.model_path,
                                map_location=torch.device(self.device))
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)

        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args

        # Context manager so the file handle is released deterministically.
        with open(args.label_map_path, encoding='utf-8') as fp:
            self.label_map = json.load(fp)
        self.id2label = {
            idx: label_str for label_str, idx in self.label_map.items()
        }

    def run(self, batch_size=64):
        """Evaluate on the test set, print metrics and dump the bad cases.

        Args:
            batch_size: batch size of the evaluation data loader.

        The misclassified samples are written, one JSON object per line, to
        ``<root>/logs/bad_case.json`` where ``<root>`` is three directory
        levels above the test file.
        """
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)

        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)

        # Three levels above the test file.
        root_dir = os.path.dirname(
            os.path.dirname(os.path.dirname(test_path)))
        log_dir = os.path.join(root_dir, 'logs')
        os.makedirs(log_dir, exist_ok=True)
        new_path = os.path.join(log_dir, 'bad_case.json')

        # ensure_ascii=False emits raw non-ASCII text, so the encoding is
        # pinned; the context manager guarantees the data is flushed.
        with open(new_path, 'w', encoding='utf-8') as f:
            for bad_case in res:
                print(json.dumps(bad_case, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        """Run the model over *_loader* and compute the metrics.

        Args:
            _loader: iterable of batches providing ``'input_ids'``,
                ``'attention_mask'``, ``'labels'`` and ``'text'``.

        Returns:
            Tuple ``(accuracy, recall, f1, confusion_matrix, report,
            bad_cases)`` where ``bad_cases`` is a list of dicts with keys
            ``'true_label'``, ``'pred_label'`` and ``'text'``.
        """
        self.model.eval()
        y_true = []
        y_pred = []
        res = []
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = torch.squeeze(batch['labels'], 1).to(self.device)

                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)

                pred_labels = torch.argmax(logits, dim=1).cpu().numpy()
                # Flatten rather than squeeze(): an unqualified squeeze turns
                # a one-sample batch into a 0-d array that cannot be iterated.
                true_labels = y.reshape(-1).cpu().numpy()

                for i, (true_id, pred_id) in enumerate(
                        zip(true_labels, pred_labels)):
                    if true_id != pred_id:
                        res.append({
                            'true_label': self.id2label[int(true_id)],
                            'pred_label': self.id2label[int(pred_id)],
                            'text': batch['text'][i]
                        })

        y_true = torch.cat(y_true)
        y_pred = torch.cat(y_pred)

        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)

        label_range = list(range(len(self.label_map)))
        # Label names ordered by their integer id.
        target_name = [
            name for name, _ in sorted(self.label_map.items(),
                                       key=lambda item: item[1])
        ]
        report = metrics.get_classification_report(y_true, y_pred,
                                                   label_range, target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        """Build a shuffling ``DataLoader`` for the file at *f_path*.

        Args:
            f_path: path handed to ``prepare`` together with the label map.
            batch_size: batch size of the returned loader.
        """
        # NOTE(review): this seeds NumPy only - presumably for prepare();
        # confirm whether the DataLoader shuffle below should be seeded too.
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
device="cuda:0") train_dataloader = make_dataloader(train_dataset, batch_size=args.train_batch_size) dev_dataloader = make_dataloader(dev_dataset, batch_size=args.dev_batch_size, shuffle=False) if args.batch_size is None: args.batch_size = args.train_batch_size if args.batch_size % args.train_batch_size != 0: raise ValueError("GPU batch size should divide batch size per update.") batches_per_update = args.batch_size // args.train_batch_size bert_classifier = BertClassifier(model, state_key="pooler_output", lr=args.lr, accumulate_gradients=batches_per_update).to("cuda:0") best_score, best_weights = 0.0, None if args.load_file: bert_classifier.load_state_dict(torch.load(args.load_file)) if args.train: model.train() for epoch in range(args.nepochs): progress_bar = tqdm.tqdm(train_dataloader) metrics = initialize_metrics() for i, batch in enumerate(progress_bar, 1): outputs = bert_classifier.train_on_batch(batch) postfix = update_metrics(metrics, outputs, batch["labels"]) progress_bar.set_postfix(postfix) if (args.eval_every_n_batches > 0 and i % args.eval_every_n_batches == 0 and len(train_dataloader) - i >= args.eval_every_n_batches // 2) or\ i == len(train_dataloader): dev_metrics = initialize_metrics() dev_progress_bar = tqdm.tqdm(dev_dataloader) for j, batch in enumerate(dev_progress_bar):
import torch from model import BertClassifier from transformers import BertTokenizer, BertConfig from train import get_bert_input import pandas as pd labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'] bert_config = BertConfig.from_pretrained('chinese_wwm_pytorch') bert_config.num_labels = len(labels) model = BertClassifier(bert_config) model.load_state_dict( torch.load('./best_model_on_trainset.pkl', map_location=torch.device('cpu'))) tokenizer = BertTokenizer(vocab_file='chinese_wwm_pytorch/vocab.txt') device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu') model = torch.nn.DataParallel(model, device_ids=[2]) model.to(device) def predict_text(text): input_id, attention_mask, token_type_id = get_bert_input(text, tokenizer) input_id = torch.tensor([input_id], dtype=torch.long) attention_mask = torch.tensor([attention_mask], dtype=torch.long) token_type_id = torch.tensor([token_type_id], dtype=torch.long) predicted = model( input_id,
# coding: utf-8 # @File: predict.py # @Author: HE D.H. # @Email: [email protected] # @Time: 2020/10/10 17:13:57 # @Description: import torch from model import BertClassifier from transformers import BertTokenizer, BertConfig labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'] bert_config = BertConfig.from_pretrained('bert-base-chinese') model = BertClassifier(bert_config, len(labels)) model.load_state_dict( torch.load('models/best_model.pkl', map_location=torch.device('cpu'))) tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') print('新闻类别分类') while True: text = input('Input: ') token = tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=512) input_ids = token['input_ids'] attention_mask = token['attention_mask'] token_type_ids = token['token_type_ids']
attention_mask = (x != 0).float().to(config.DEVICE).long() outputs = MODEL(x, attention_mask=attention_mask) return outputs.cpu().detach().numpy() @app.route('/predict') def predict(): comment = request.args.get('comment') start_time = time.time() prediction = comment_prediction(comment) response = { 'response': { label: str(prob) for label, prob in zip(config.CLASS_COLS, prediction[0]) } } response['response']['comment'] = comment response['response']['time_taken'] = str(time.time() - start_time) return flask.jsonify(response) if __name__ == '__main__': bert_config = BertConfig.from_pretrained(config.BERT_NAME) bert_config.num_labels = config.NUM_CLASSES MODEL = BertClassifier(bert_config) MODEL.load_state_dict(torch.load(config.TRAINED_MODEL_PATH)) MODEL.to(config.DEVICE) MODEL.eval() app.run(host=config.HOST, port=config.PORT)
class NLU:
    """Natural-language understanding front end for the movie dialogue system.

    Classifies an utterance into one of ``order_list`` with a BERT-based
    classifier and matches genre keywords loaded from MySQL.
    """

    # Checkpoint used when no explicit path is given to the constructor.
    DEFAULT_MODEL_PATH = "/Users/jinsakuma/Downloads/model_gpu_v4.3.3.pth"

    def __init__(self, config, RecognitionResultManager, model_path=None):
        """Load the genre keywords, the tokenizer and the classifier.

        Args:
            config: mapping read as ``config['DB']['host']``,
                ``config['DB']['db_name']`` and ``config['DB']['user']``.
            RecognitionResultManager: object exposing ``get_df()``; used by
                ``get_text``.
            model_path: optional path to the classifier weights; defaults to
                ``DEFAULT_MODEL_PATH``.
        """
        self.config = config
        db_conf = self.config['DB']
        self.genre_keywords_db = self.get_db(db_conf['host'],
                                             db_conf['db_name'],
                                             db_conf['user'])
        self.rrm = RecognitionResultManager
        self.model_path = model_path or self.DEFAULT_MODEL_PATH

        pretrained = 'cl-tohoku/bert-base-japanese-whole-word-masking'
        self.model_config = BertConfig.from_pretrained(
            pretrained, output_attentions=True)
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained)
        self.bert_model = BertModel.from_pretrained(
            pretrained, config=self.model_config)
        self.model = BertClassifier(self.bert_model)
        self.max_len = 30

        self.device = torch.device("cpu")
        self.load_weights(self.model_path)
        # Inference only: switch off training-time behaviour such as dropout.
        self.model.eval()

        # Index i is the label for classifier output class i.
        self.order_list = [
            'recommendation', 'title', 'abstract', 'review', 'evaluation',
            'actor', 'genre', 'director', None
        ]

    def get_db(self, host="localhost", db_name="woz_system", user="******"):
        """Load the genre keywords from MySQL.

        Args:
            host: database host.
            db_name: database name.
            user: database user (connects with an empty password).

        Returns:
            dict mapping each genre type to its list of keywords; a genre
            without a keyword row maps to an empty list.
        """
        connector = MySQLdb.connect(host=host,
                                    db=db_name,
                                    user=user,
                                    passwd="",
                                    charset="utf8")
        try:
            cursor = connector.cursor()  # create a cursor
            try:
                # keywords for movie recommendation
                cursor.execute('select * from genre')
                genres = cursor.fetchall()

                genre_keywords_db = {}
                for genre in genres:
                    genre_id = genre[1]
                    genre_type = genre[2]
                    # Let the driver do the quoting instead of formatting the
                    # value into the SQL text.
                    cursor.execute(
                        'select keywords from genre_keywords '
                        'where genre_id=%s', (genre_id,))
                    rows = cursor.fetchall()
                    if rows and rows[0][0]:
                        # Drop empty entries: an empty keyword would be a
                        # substring of every text.
                        keyword_list = [
                            kw for kw in rows[0][0].split(",") if kw
                        ]
                    else:
                        keyword_list = []
                    genre_keywords_db[genre_type] = keyword_list
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when a query fails.
            connector.close()
        return genre_keywords_db

    def load_weights(self, model_path):
        """Load the classifier weights stored at *model_path* onto the CPU."""
        # The model runs on the CPU, so remap tensors from any device.
        load_weights = torch.load(model_path,
                                  map_location=torch.device('cpu'))
        self.model.load_state_dict(load_weights)

    def bert_tokenizer(self, input_text):
        """Encode *input_text* into a 1-D tensor of at most ``max_len`` ids."""
        encoded = self.tokenizer.encode(input_text,
                                        max_length=self.max_len,
                                        truncation=True,
                                        return_tensors='pt')
        return encoded[0]

    def get_order(self, input_text):
        """Classify *input_text* and return the matching ``order_list`` entry.

        Returns:
            One of the entries of ``order_list`` (the last one is ``None``).
        """
        token = self.bert_tokenizer(input_text).unsqueeze(0)
        # No gradients are needed for inference.
        with torch.no_grad():
            output, _attentions = self.model(token.to(self.device))
        _, pred = torch.max(output, 1)
        order = self.order_list[pred.item()]
        print("NLU result: ", order)
        return order

    def get_text(self, N):
        """Return the last *N* transcripts and speakers.

        Args:
            N: number of most recent rows to return.

        Returns:
            Tuple ``(text_list, target_list)``; both are empty when *N* is
            not positive.
        """
        if N <= 0:
            # iloc[-0:] would select the whole frame instead of nothing.
            return [], []
        df = self.rrm.get_df()
        text_list = df['transcript'].iloc[-N:].tolist()
        target_list = df['speaker'].iloc[-N:].tolist()
        return text_list, target_list

    def check_genre(self, input_texts):
        """Return the first genre whose keyword occurs in a recent text.

        Texts are scanned from the last to the first; ``None`` is returned
        when no keyword matches.
        """
        # keyword matching
        for text in reversed(input_texts):
            for response_type, keywords in self.genre_keywords_db.items():
                for keyword in keywords:
                    # Skip empty keywords; '' is contained in every string.
                    if keyword and keyword in text:
                        return response_type
        return None