def predict(config=None, model=None, sent=None):
    """Run trigger identification (BIO tagging) on raw sentences.

    Input: raw sentences read from ``config.input_file``, or a single raw
    sentence passed via ``sent`` (the ``sent`` path is taken only when
    ``config.input_file`` is falsy).
    Output: results of trigger identification saved to
    ``config.tri_id_result_file`` in the format ``sentence ||| tag (BIO)``.

    Args:
        config: optional Config; a default ``Config()`` is built when omitted.
        model: optional pre-loaded BertLstmCrf; when omitted, one is built
            and its weights restored from ``config.load_path``.
        sent: optional single raw sentence string.

    Returns:
        Single-sentence path: list of BIO tag strings for the sentence's
        tokens (the [CLS]/[SEP] positions are stripped).
        File path: list of per-sentence predicted label-id lists, which is
        also persisted via ``save_results``.
    """
    # --- configuration and vocabularies ---------------------------------
    if not config:
        config = Config()
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_id_label_file)
    tagset_size = len(label_dic)

    # --- model -----------------------------------------------------------
    if not model:
        model = BertLstmCrf(config.bert_path,
                            tagset_size,
                            config.bert_embedding,
                            config.rnn_hidden,
                            config.rnn_layer,
                            dropout_ratio=config.dropout_ratio,
                            dropout1=config.dropout1,
                            use_cuda=config.use_cuda)
        model = load_model(model, name=config.load_path)
    # Guard on actual availability too; the original called model.cuda()
    # on config.use_cuda alone, which crashes on a CPU-only machine.
    use_cuda = config.use_cuda and torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    # Inference mode: disable dropout (elsewhere in this file evaluation
    # restores model.train() after finishing, so train mode is otherwise on).
    model.eval()

    unk_id = int(vocab['[UNK]'])

    # --- single-sentence path -------------------------------------------
    if (not config.input_file) and sent:
        # Lowercase, tokenize, truncate to leave room for [CLS]/[SEP].
        tokens = sent.lower().split()[:config.max_length - 2]
        tokens_f = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = torch.LongTensor(
            [[int(vocab.get(tok, unk_id)) for tok in tokens_f]])
        input_masks = torch.LongTensor([[1] * input_ids.size(1)])
        if use_cuda:
            input_ids, input_masks = input_ids.cuda(), input_masks.cuda()
        with torch.no_grad():
            feats = model(input_ids, input_masks)
            path_score, best_path = model.crf(feats, input_masks)
        pred_label = best_path[0].cpu().numpy().tolist()
        # Map label ids back to tag strings, dropping [CLS]/[SEP] slots.
        # NOTE: relies on load_vocab preserving the label file's order.
        id2label = list(label_dic.keys())
        return [id2label[int(x)] for x in pred_label[1:-1]]

    # --- file path -------------------------------------------------------
    with open(config.input_file, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    data = []
    for line in raw_lines:
        tokens = line.lower().split()[:config.max_length - 2]
        tokens_f = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = [int(vocab.get(tok, unk_id)) for tok in tokens_f]
        input_masks = [1] * len(input_ids)
        # Pad to a fixed length so the batch tensors can be stacked.
        pad = config.max_length - len(input_ids)
        input_ids.extend([0] * pad)
        input_masks.extend([0] * pad)
        data.append((input_ids, input_masks))

    ids = torch.LongTensor([item[0] for item in data])
    all_masks = torch.LongTensor([item[1] for item in data])
    loader = DataLoader(TensorDataset(ids, all_masks),
                        shuffle=False,
                        batch_size=config.batch_size)

    sents = []
    pred = []
    for batch in tqdm.tqdm(loader):
        inputs, masks = batch
        masks = masks.bool()
        # Keep the unpadded token ids so save_results can rebuild each
        # sentence next to its predicted tags.
        for idx in range(inputs.shape[0]):
            sents.append(inputs[idx][masks[idx]].cpu().numpy().tolist())
        if use_cuda:
            inputs, masks = inputs.cuda(), masks.cuda()
        with torch.no_grad():
            feats = model(inputs, masks)
            path_score, best_path = model.crf(feats, masks.byte())
        for idx in range(inputs.shape[0]):
            pred.append(best_path[idx][masks[idx]].cpu().numpy().tolist())

    # Persist "sentence ||| tag" pairs, then hand back raw predictions.
    save_results(sents, pred, config)
    return pred
accuracy) print('eval epoch: {}| loss: {}'.format(epoch, eval_loss / length)) model.train() return eval_loss if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--do-train', type=bool, default=False, help='Whether to retrain the model.') parser.add_argument('--do-eval', type=bool, default=False, help='Whether to perform evaluation.') parser.add_argument('--with-golden-trigger', type=bool, default=False, help='Whether to evaluate with golden triggers.') args = parser.parse_args() if args.do_train: train() if args.do_eval: if not args.with_golden_trigger: predict() else: config = Config() config.tri_id_result_file = './data/tri_id_test.txt' config.gold_trigger_file = './data/golden_test.txt' predict(config)
def _make_loader(path, config, label_dic, vocab, shuffle):
    """Read one tri-id corpus file and wrap it in a DataLoader.

    Each corpus item is expected to be a (token_ids, mask, tag_ids) triple,
    as produced by read_corpus_tr_id.
    """
    data = read_corpus_tr_id(path,
                             max_length=config.max_length,
                             label_dic=label_dic,
                             vocab=vocab)
    ids = torch.LongTensor([item[0] for item in data])
    masks = torch.LongTensor([item[1] for item in data])
    tags = torch.LongTensor([item[2] for item in data])
    return DataLoader(TensorDataset(ids, masks, tags),
                      shuffle=shuffle,
                      batch_size=config.batch_size)


def train(config=None):
    """Train the BertLstmCrf trigger-identification model.

    Loads the train/dev corpora named in ``config``, trains for
    ``config.base_epoch`` epochs, evaluates on the dev set after every
    epoch, and checkpoints the model whenever the dev loss improves.

    Args:
        config: optional Config; a default ``Config()`` is built when omitted.

    Returns:
        The model in its final-epoch state.
    """
    if not config:
        config = Config()
    print('settings:\n', config)

    # --- corpora ---------------------------------------------------------
    print('loading corpus.')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_id_label_file)
    tagset_size = len(label_dic)

    train_loader = _make_loader(config.tri_id_train_file, config,
                                label_dic, vocab, shuffle=True)
    # (dev shuffling is kept for parity with the original; it does not
    # change the averaged dev loss)
    dev_loader = _make_loader(config.tri_id_dev_file, config,
                              label_dic, vocab, shuffle=True)

    # --- model -----------------------------------------------------------
    model = BertLstmCrf(config.bert_path,
                        tagset_size,
                        config.bert_embedding,
                        config.rnn_hidden,
                        config.rnn_layer,
                        dropout_ratio=config.dropout_ratio,
                        dropout1=config.dropout1,
                        use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.cuda()

    # --- optimisation loop -----------------------------------------------
    print('begin training.')
    model.train()
    optimizer_cls = getattr(optim, config.optim)
    optimizer = optimizer_cls(model.parameters(),
                              lr=config.lr,
                              weight_decay=config.weight_decay)

    # float('inf') instead of the original magic 10000: the first dev
    # evaluation now always produces a checkpoint, even if its loss is huge.
    eval_loss = float('inf')
    for epoch in tqdm.tqdm(range(config.base_epoch)):
        for batch in tqdm.tqdm(train_loader):
            model.zero_grad()
            inputs, masks, tags = batch
            masks = masks.bool()
            if config.use_cuda:
                inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda()
            feats = model(inputs, masks)
            loss = model.loss(feats, masks, tags)
            loss.backward()
            optimizer.step()
        # Checkpoint on dev-loss improvement.
        dev_loss_temp = evaluate(model, dev_loader, epoch, config)
        if dev_loss_temp < eval_loss:
            print('dev loss: ', eval_loss, ' -> ', dev_loss_temp)
            eval_loss = dev_loss_temp
            save_model(model, epoch)
    return model