def process_query(query, hp, model):
    """Tag every whitespace-separated word of ``query`` with a label.

    The words are wrapped in ``[CLS]`` / ``[SEP]``, split into word pieces
    with ``hp.tokenizer`` and fed to ``model`` as a batch of one. Only the
    prediction made for the first piece of each word is kept, and the
    predictions for ``[CLS]`` and ``[SEP]`` are dropped.

    Args:
        query: Raw text; it is split on whitespace.
        hp: Hyper-parameter object providing ``tokenizer``, ``batch_size``
            and ``idx2tag``. The object passed in is used as-is (it is no
            longer replaced by a freshly built ``HParams('i2b2')``).
        model: Network exposing ``eval()``, ``init_eval_hidden(batch_size)``
            and ``model(x, hidden)``, whose third return value holds one
            predicted tag id per input token.

    Returns:
        A list of ``[word, label]`` pairs, one per word of ``query``.
    """
    words = query.split()

    token_ids = []  # word-piece ids of the whole sequence
    is_heads = []  # 1 where the piece is the first piece of a word, else 0
    for word in ["[CLS]", *words, "[SEP]"]:
        if word in ("[CLS]", "[SEP]"):
            pieces = [word]
        else:
            # A word may tokenize to nothing; keep a single placeholder piece
            # so the head mask stays aligned with the ids and every word
            # still receives a label.
            # NOTE(review): assumes the vocabulary contains "[UNK]" - confirm.
            pieces = hp.tokenizer.tokenize(word) or ["[UNK]"]
        token_ids.extend(hp.tokenizer.convert_tokens_to_ids(pieces))
        is_heads.extend([1] + [0] * (len(pieces) - 1))

    x = torch.LongTensor(token_ids).unsqueeze(dim=0)  # batch of one

    model.eval()
    hidden = model.init_eval_hidden(hp.batch_size)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        _, _, y_pred = model(x, hidden)

    # Keep the prediction of the first piece of each word only.
    head_mask = np.array(is_heads, dtype=bool)
    head_preds = y_pred[0].cpu().numpy()[head_mask]

    # Map ids to tags and drop the labels of [CLS] and [SEP].
    labels = [hp.idx2tag[i] for i in head_preds][1:-1]
    return [[word, label] for word, label in zip(words, labels)]
def get_ner(query):
    """Extract i2b2 entities from ``query`` and mask them in the sentence.

    Runs ``process_query`` with the i2b2 hyper-parameters and
    ``i2b2_model``, then walks the ``[word, label]`` pairs it returns:

    * words labelled ``O`` are copied to the masked sentence unchanged;
    * words labelled ``B-<type>`` / ``I-<type>`` (``problem``, ``test`` or
      ``treatment``) are replaced by ``<type>`` in the masked sentence;
    * a ``B-`` word starts a new entity, an ``I-`` word is appended to the
      most recent entity (or starts one when none exists yet);
    * words carrying any other label are dropped.

    All three entity types are handled identically. Previously the
    ``I-treatment`` branch raised ``IndexError`` when no entity had been
    collected yet and did not write ``treatment`` into the masked sentence.

    Args:
        query: Raw text to analyse.

    Returns:
        A ``(masked_sentence, entities)`` tuple: the space-joined masked
        sentence and the list of entity strings in order of appearance.
    """
    hp = HParams('i2b2')
    out = process_query(query=query, hp=hp, model=i2b2_model)

    entity_types = ('problem', 'test', 'treatment')
    result = []  # words of the masked sentence
    ners = []  # collected entity strings
    for word, label in out:
        if label == 'O':
            result.append(word)
            continue

        prefix, _, entity = label.partition('-')
        if prefix not in ('B', 'I') or entity not in entity_types:
            continue  # unknown label: the word is left out, as before

        result.append(entity)
        if prefix == 'I' and ners:
            # Continuation piece: extend the most recent entity.
            ners[-1] = f"{ners[-1]} {word}"
        else:
            ners.append(word)

    return " ".join(result), ners
def process_query(query, hp, model):
    """Run the relation model on ``query`` and return the predicted tags.

    The query is split on whitespace and every word is split into word
    pieces with ``hp.tokenizer``; literal ``[CLS]`` / ``[SEP]`` words are
    passed through untouched (they are not added here). The prediction made
    for the first piece of each word is mapped through ``hp.idx2tag``.

    The original body read ``is_heads`` without ever defining it, which
    raised ``NameError`` on every call; the mask is now built while
    tokenizing.

    NOTE(review): the mask assumes ``model`` returns one prediction per
    input token. If the relation model emits a single label per sequence,
    the masking step should be removed instead - confirm against the model.

    Args:
        query: Raw text; it is split on whitespace.
        hp: Hyper-parameter object providing ``tokenizer``, ``batch_size``
            and ``idx2tag``. The object passed in is used as-is (it is no
            longer replaced by a freshly built ``HParams('relations')``).
        model: Network exposing ``eval()``, ``init_eval_hidden(batch_size)``
            and ``model(x, hidden)``, whose third return value holds the
            predicted tag ids.

    Returns:
        A list with one tag per whitespace-separated word of ``query``.
    """
    token_ids = []  # word-piece ids of the whole sequence
    is_heads = []  # 1 where the piece is the first piece of a word, else 0
    for word in query.split():
        if word in ("[CLS]", "[SEP]"):
            pieces = [word]
        else:
            # Keep one placeholder piece when a word tokenizes to nothing so
            # the head mask stays aligned with the ids.
            # NOTE(review): assumes the vocabulary contains "[UNK]" - confirm.
            pieces = hp.tokenizer.tokenize(word) or ["[UNK]"]
        token_ids.extend(hp.tokenizer.convert_tokens_to_ids(pieces))
        is_heads.extend([1] + [0] * (len(pieces) - 1))

    x = torch.LongTensor(token_ids).unsqueeze(dim=0)  # batch of one

    model.eval()
    hidden = model.init_eval_hidden(hp.batch_size)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        _, _, y_pred = model(x, hidden)

    # Keep the prediction of the first piece of each word only.
    head_mask = np.array(is_heads, dtype=bool)
    head_preds = y_pred[0].cpu().numpy()[head_mask]

    return [hp.idx2tag[i] for i in head_preds]
import numpy as np from pytorch_pretrained_bert.modeling import BertConfig import parameters from collections import OrderedDict import json from torch.autograd import Variable from sklearn.metrics import f1_score from functools import partial import pickle from sklearn.metrics import precision_recall_fscore_support # device = 'cpu' device = 'cuda' model_state_dict = torch.load('./weights/save_file') hp = HParams('i2b2') clip = 5 def train(model, iterator, optimizer, criterion): model.train() model = model.to(device) hidden = model.init_hidden(hp.batch_size) for i, batch in enumerate(iterator): if (i < 30): words, x, is_heads, tags, y, seqlens = batch _y = y # for monitoring hidden = tuple([each.data for each in hidden])
fout.write(f"f1={f1}\n") os.remove(f) print("precision=%.2f" % precision) print("recall=%.2f" % recall) print("f1=%.2f" % f1) return precision, recall, f1 if __name__ == "__main__": train_dataset = NerDataset("data/train.tsv", 'bc5cdr') # here bc5cdr is dataset type eval_dataset = NerDataset("data/test.tsv", 'bc5cdr') hp = HParams('bc5cdr') # Define model config = BertConfig( vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) model = Net(config=config, bert_state_dict=state_dict, vocab_len=len(hp.VOCAB), device=hp.device) if torch.cuda.is_available(): model.cuda() model.train() # update with already pretrained weight train_iter = data.DataLoader(dataset=train_dataset, batch_size=hp.batch_size,
def get_bionlp13cg(query):
    """Tag ``query`` with ``bionlp13cg_model`` and return it as JSON.

    Builds the ``'bionlp3g'`` hyper-parameters, logs the query to stdout,
    runs ``process_query`` and wraps its output in a ``JSONResponse`` under
    the ``'tags'`` key.

    Args:
        query: Raw text to tag.

    Returns:
        A ``JSONResponse`` whose body is ``{'tags': <process_query output>}``.
    """
    hparams = HParams('bionlp3g')
    print("bionlp3g -> ", query)
    tagged = process_query(query=query, hp=hparams, model=bionlp13cg_model)
    payload = {'tags': tagged}
    return JSONResponse(payload)
def get_bc5cdr(query):
    """Tag ``query`` with ``bc5_model`` and return it as JSON.

    Builds the ``'bc5cdr'`` hyper-parameters, logs the query to stdout,
    runs ``process_query`` and wraps its output in a ``JSONResponse`` under
    the ``'tagging'`` key.

    Args:
        query: Raw text to tag.

    Returns:
        A ``JSONResponse`` whose body is
        ``{'tagging': <process_query output>}``.
    """
    hparams = HParams('bc5cdr')
    print("bc5cdr -> ", query)
    tagged = process_query(query=query, hp=hparams, model=bc5_model)
    payload = {'tagging': tagged}
    return JSONResponse(payload)
import uvicorn import aiohttp config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) app = Starlette() def build_model(config, state_dict, hp): model = Net(config, vocab_len=len(hp.VOCAB), bert_state_dict=None) _ = model.load_state_dict(torch.load(state_dict, map_location='cpu')) _ = model.to('cpu') # inference return model # Model loaded bc5_model = build_model(config, parameters.BC5CDR_WEIGHT, HParams('bc5cdr')) bionlp13cg_model = build_model(config, parameters.BIONLP13CG_WEIGHT, HParams('bionlp3g')) # Process Query def process_query(query, hp, model): s = query split_s = ["[CLS]"] + s.split() + ["[SEP]"] x = [] # list of ids is_heads = [] # list. 1: the token is the first piece of a word for w in split_s: tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w] xx = hp.tokenizer.convert_tokens_to_ids(tokens)
from pytorch_pretrained_bert import BertModel import parameters import numpy as np import torch config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) def build_model(config, state_dict, hp): model = Net(config, vocab_len=len(hp.VOCAB), bert_state_dict=None) _ = model.load_state_dict(torch.load(state_dict, map_location='cpu')) _ = model.to('cpu') # inference return model i2b2_model = build_model(config, parameters.I2b2_WEIGHTS, HParams('i2b2')) # Process Query def process_query(query, hp, model): s = query split_s = ["[CLS]"] + s.split() + ["[SEP]"] x = [] # list of ids is_heads = [] # list. 1: the token is the first piece of a word for w in split_s: tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w] xx = hp.tokenizer.convert_tokens_to_ids(tokens) is_head = [1] + [0] * (len(tokens) - 1) x.extend(xx)
def get_i2b2(query):
    """Run ``i2b2_model`` on ``query`` with the relation hyper-parameters.

    Builds the ``'relations'`` hyper-parameters, logs the query to stdout
    and hands both to ``process_query``.

    Args:
        query: Raw text to analyse.

    Returns:
        Whatever ``process_query`` returns for this query.
    """
    hparams = HParams('relations')
    print("relations -> ", query)
    return process_query(query=query, hp=hparams, model=i2b2_model)
from pytorch_pretrained_bert.modeling import BertConfig from pytorch_pretrained_bert import BertModel import parameters import numpy as np import torch config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) def build_model(config, state_dict, hp): model = RelNet(config, vocab_len = len(hp.VOCAB), bert_state_dict=None) _ = model.load_state_dict(torch.load(state_dict, map_location='cpu')) _ = model.to('cpu') # inference return model i2b2_model = build_model(config, parameters.I2b2_WEIGHTS, HParams('relations')) # Process Query def process_query(query, hp, model): s = query split_s = s.split() x = [] # list of ids for w in split_s: tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w] xx = hp.tokenizer.convert_tokens_to_ids(tokens) x.extend(xx) x = torch.LongTensor(x).unsqueeze(dim=0) # Process query
fout.write(f"recall={recall}\n") fout.write(f"f1={f1}\n") fout.write(f"accuracy={accuracy}\n") os.remove(f) print("precision=%.2f" % precision) print("recall=%.2f" % recall) print("f1=%.2f" % f1) print("accuracy=%.2f" % accuracy) return precision, recall, f1 if __name__ == "__main__": hp = HParams('i2b2') train_on_gpu = torch.cuda.is_available() hp = HParams('relations') relations_train_dataset = RelationDataset( "Data/formatted/relationsTrainFinal.tsv", 'relations') relations_eval_dataset = RelationDataset( "Data/formatted/relationsTestFinal.tsv", 'relations') # Define model config = BertConfig( vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) model = RelNet(config=config, bert_state_dict=state_dict, vocab_len=len(hp.VOCAB),