예제 #1
0
def process_query(query, hp, model):
    """Label every whitespace-separated word of ``query`` with a predicted tag.

    The sentence is wrapped in ``[CLS]`` / ``[SEP]``, each word is split into
    word pieces by ``hp.tokenizer``, and only the prediction of the first
    piece of every word is kept.

    Args:
        query: Raw input sentence; it is split on whitespace.
        hp: Hyper-parameter object providing ``tokenizer`` (with ``tokenize``
            and ``convert_tokens_to_ids``), ``batch_size`` and ``idx2tag``.
            The object passed by the caller is used as-is; it is no longer
            replaced by a hard-coded ``HParams('i2b2')``.
        model: Object exposing ``eval()``, ``init_eval_hidden(batch_size)``
            and ``model(x, hidden)`` whose third return value holds the
            predicted tag indices (indexed as ``y_pred[0]`` here).

    Returns:
        A list of ``[word, label]`` pairs, one per word of ``query``.
    """
    words = query.split()
    x = []  # word-piece ids of the whole sentence
    is_heads = []  # 1 where the piece is the first piece of a word, else 0

    for w in ["[CLS]"] + words + ["[SEP]"]:
        # The special markers are already vocabulary tokens; do not split them.
        tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]",
                                                       "[SEP]") else [w]
        x.extend(hp.tokenizer.convert_tokens_to_ids(tokens))
        is_heads.extend([1] + [0] * (len(tokens) - 1))

    x = torch.LongTensor(x).unsqueeze(dim=0)  # add the batch dimension

    model.eval()
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        hidden = model.init_eval_hidden(hp.batch_size)
        _, _, y_pred = model(x, hidden)

    # Keep only the prediction attached to the first piece of each word.
    preds = y_pred[0].cpu().numpy()[np.array(is_heads) == 1]

    # Map indices to tag names and drop the [CLS] / [SEP] positions.
    labels = [hp.idx2tag[i] for i in preds][1:-1]
    return [[word, label] for word, label in zip(words, labels)]
예제 #2
0
def get_ner(query):
    """Replace recognised entities in ``query`` by their type and collect them.

    The query is tagged with ``process_query`` using the i2b2 model. Words
    tagged ``O`` are copied to the output sentence. A ``B-<type>`` word starts
    a new entity: its type name is written to the output sentence and the
    word is stored in the entity list. An ``I-<type>`` word is appended to the
    most recent entity; when no entity exists yet it starts one, exactly like
    a ``B-`` tag. Previously ``I-treatment`` was the only continuation tag
    without that fallback and raised ``IndexError`` on an empty entity list.

    Args:
        query: Raw input sentence.

    Returns:
        A ``(result, ners)`` tuple: ``result`` is the sentence with each
        entity replaced by ``problem`` / ``test`` / ``treatment``; ``ners`` is
        the list of entity surface strings in order of appearance.
    """
    hp = HParams('i2b2')
    tagged = process_query(query=query, hp=hp, model=i2b2_model)

    entity_types = ('problem', 'test', 'treatment')
    result = []
    ners = []
    for item in tagged:
        word, label = item[0], item[1]
        if label == 'O':
            result.append(word)
            continue

        prefix, _, entity = label.partition('-')
        if prefix not in ('B', 'I') or entity not in entity_types:
            # Any other label is ignored, as before.
            continue

        if prefix == 'I' and ners:
            # Continuation: glue the word onto the latest entity.
            ners[-1] = f"{ners[-1]} {word}"
        else:
            # A B- tag, or an I- tag with nothing to continue.
            result.append(entity)
            ners.append(word)

    return " ".join(result), ners
예제 #3
0
def process_query(query, hp, model):
    """Run the model on ``query`` and return the predicted tag names.

    Unlike the NER variant, the sentence is not wrapped in ``[CLS]`` /
    ``[SEP]`` here; those markers are only protected from word-piece
    splitting when the caller already put them in ``query``.

    Args:
        query: Raw input sentence; it is split on whitespace.
        hp: Hyper-parameter object providing ``tokenizer``, ``batch_size``
            and ``idx2tag``. The object passed by the caller is used as-is;
            it is no longer replaced by a hard-coded ``HParams('relations')``.
        model: Object exposing ``eval()``, ``init_eval_hidden(batch_size)``
            and ``model(x, hidden)`` whose third return value holds the
            predicted tag indices.

    Returns:
        A list with one tag name per whitespace-separated word.
    """
    x = []  # word-piece ids of the whole sentence
    # The original body used ``is_heads`` without ever defining it, which
    # raised NameError. It is rebuilt here the same way the NER variant does.
    # NOTE(review): assumes the model emits one prediction per word piece --
    # confirm against the relation model.
    is_heads = []

    for w in query.split():
        tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
        x.extend(hp.tokenizer.convert_tokens_to_ids(tokens))
        is_heads.extend([1] + [0] * (len(tokens) - 1))

    x = torch.LongTensor(x).unsqueeze(dim=0)  # add the batch dimension

    model.eval()
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        hidden = model.init_eval_hidden(hp.batch_size)
        _, _, y_pred = model(x, hidden)

    # Keep only the prediction attached to the first piece of each word.
    preds = y_pred[0].cpu().numpy()[np.array(is_heads) == 1]

    return [hp.idx2tag[i] for i in preds]
예제 #4
0
import numpy as np
from pytorch_pretrained_bert.modeling import BertConfig
import parameters
from collections import OrderedDict
import json
from torch.autograd import Variable
from sklearn.metrics import f1_score
from functools import partial
import pickle
from sklearn.metrics import precision_recall_fscore_support
# Use the GPU when one is present; otherwise fall back to the CPU so the
# script still starts on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# On a CPU-only host the checkpoint tensors are remapped to the CPU; on a GPU
# host the default placement is kept, exactly as before.
model_state_dict = torch.load('./weights/save_file',
                              map_location=None if device == 'cuda' else 'cpu')

hp = HParams('i2b2')
clip = 5  # presumably the gradient-clipping threshold -- TODO confirm at its use site


def train(model, iterator, optimizer, criterion):

    model.train()
    model = model.to(device)
    hidden = model.init_hidden(hp.batch_size)

    for i, batch in enumerate(iterator):
        if (i < 30):
            words, x, is_heads, tags, y, seqlens = batch
            _y = y  # for monitoring
            hidden = tuple([each.data for each in hidden])
예제 #5
0
        fout.write(f"f1={f1}\n")

    os.remove(f)

    print("precision=%.2f" % precision)
    print("recall=%.2f" % recall)
    print("f1=%.2f" % f1)
    return precision, recall, f1


if __name__ == "__main__":

    train_dataset = NerDataset("data/train.tsv",
                               'bc5cdr')  # here bc5cdr is dataset type
    eval_dataset = NerDataset("data/test.tsv", 'bc5cdr')
    hp = HParams('bc5cdr')

    # Define model
    config = BertConfig(
        vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)
    model = Net(config=config,
                bert_state_dict=state_dict,
                vocab_len=len(hp.VOCAB),
                device=hp.device)
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    # update with already pretrained weight

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
예제 #6
0
def get_bionlp13cg(query):
    """Tag ``query`` with the bionlp13cg model and wrap the result as JSON.

    Args:
        query: Raw input sentence.

    Returns:
        A ``JSONResponse`` whose ``tags`` key holds the ``process_query``
        output.
    """
    dataset_hp = HParams('bionlp3g')
    print("bionlp3g -> ", query)
    tagged = process_query(query=query, hp=dataset_hp, model=bionlp13cg_model)
    payload = {'tags': tagged}
    return JSONResponse(payload)
예제 #7
0
def get_bc5cdr(query):
    """Tag ``query`` with the bc5cdr model and wrap the result as JSON.

    Args:
        query: Raw input sentence.

    Returns:
        A ``JSONResponse`` whose ``tagging`` key holds the ``process_query``
        output.
    """
    dataset_hp = HParams('bc5cdr')
    print("bc5cdr -> ", query)
    tagged = process_query(query=query, hp=dataset_hp, model=bc5_model)
    payload = {'tagging': tagged}
    return JSONResponse(payload)
예제 #8
0
import uvicorn
import aiohttp

# BERT configuration built from parameters.BERT_CONFIG_FILE.
config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)
# Starlette application object, created at import time.
app = Starlette()


def build_model(config, state_dict, hp):
    """Create a ``Net`` and load saved weights into it on the CPU.

    Args:
        config: BERT configuration handed to ``Net``.
        state_dict: Location of the saved weights, passed to ``torch.load``.
        hp: Hyper-parameter object; only ``len(hp.VOCAB)`` is read here.

    Returns:
        The ``Net`` instance with the weights loaded, moved to the CPU.
    """
    net = Net(config, vocab_len=len(hp.VOCAB), bert_state_dict=None)
    # map_location='cpu' lets checkpoints saved on a GPU load without one.
    weights = torch.load(state_dict, map_location='cpu')
    net.load_state_dict(weights)
    net.to('cpu')  # inference
    return net


# Build one CPU model per dataset at import time; the weight locations come
# from the `parameters` module.
bc5_model = build_model(config, parameters.BC5CDR_WEIGHT, HParams('bc5cdr'))
bionlp13cg_model = build_model(config, parameters.BIONLP13CG_WEIGHT,
                               HParams('bionlp3g'))


# Process Query
def process_query(query, hp, model):
    s = query
    split_s = ["[CLS]"] + s.split() + ["[SEP]"]
    x = []  # list of ids
    is_heads = []  # list. 1: the token is the first piece of a word

    for w in split_s:
        tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]",
                                                       "[SEP]") else [w]
        xx = hp.tokenizer.convert_tokens_to_ids(tokens)
예제 #9
0
from pytorch_pretrained_bert import BertModel
import parameters
import numpy as np
import torch

# BERT configuration built from parameters.BERT_CONFIG_FILE.
config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)


def build_model(config, state_dict, hp):
    """Create a ``Net`` and load saved weights into it on the CPU.

    Args:
        config: BERT configuration handed to ``Net``.
        state_dict: Location of the saved weights, passed to ``torch.load``.
        hp: Hyper-parameter object; only ``len(hp.VOCAB)`` is read here.

    Returns:
        The ``Net`` instance with the weights loaded, moved to the CPU.
    """
    vocab_size = len(hp.VOCAB)
    network = Net(config, vocab_len=vocab_size, bert_state_dict=None)
    # map_location='cpu' lets checkpoints saved on a GPU load without one.
    checkpoint = torch.load(state_dict, map_location='cpu')
    network.load_state_dict(checkpoint)
    network.to('cpu')  # inference
    return network


# i2b2 model, built once at import time from parameters.I2b2_WEIGHTS.
i2b2_model = build_model(config, parameters.I2b2_WEIGHTS, HParams('i2b2'))


# Process Query
def process_query(query, hp, model):
    s = query
    split_s = ["[CLS]"] + s.split() + ["[SEP]"]
    x = []  # list of ids
    is_heads = []  # list. 1: the token is the first piece of a word

    for w in split_s:
        tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]",
                                                       "[SEP]") else [w]
        xx = hp.tokenizer.convert_tokens_to_ids(tokens)
        is_head = [1] + [0] * (len(tokens) - 1)
        x.extend(xx)
예제 #10
0
def get_i2b2(query):
    """Run ``process_query`` on ``query`` with the 'relations' hyper-parameters.

    Args:
        query: Raw input sentence.

    Returns:
        Whatever ``process_query`` returns for the module-level ``i2b2_model``.
    """
    relation_hp = HParams('relations')
    print("relations -> ", query)
    return process_query(query=query, hp=relation_hp, model=i2b2_model)
예제 #11
0
from pytorch_pretrained_bert.modeling import BertConfig
from pytorch_pretrained_bert import BertModel
import parameters
import numpy as np 
import torch

# BERT configuration built from parameters.BERT_CONFIG_FILE.
config = BertConfig(vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)

def build_model(config, state_dict, hp):
    """Create a ``RelNet`` and load saved weights into it on the CPU.

    Args:
        config: BERT configuration handed to ``RelNet``.
        state_dict: Location of the saved weights, passed to ``torch.load``.
        hp: Hyper-parameter object; only ``len(hp.VOCAB)`` is read here.

    Returns:
        The ``RelNet`` instance with the weights loaded, moved to the CPU.
    """
    rel_net = RelNet(config, vocab_len=len(hp.VOCAB), bert_state_dict=None)
    # map_location='cpu' lets checkpoints saved on a GPU load without one.
    weights = torch.load(state_dict, map_location='cpu')
    rel_net.load_state_dict(weights)
    rel_net.to('cpu')  # inference
    return rel_net


# Relation model built once at import time with the 'relations'
# hyper-parameters; note that it loads parameters.I2b2_WEIGHTS and is bound
# to the name i2b2_model.
i2b2_model = build_model(config, parameters.I2b2_WEIGHTS, HParams('relations'))

# Process Query 
def process_query(query, hp, model):
    s = query
    split_s = s.split()
    x = [] # list of ids

    for w in split_s:
        tokens = hp.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
        xx = hp.tokenizer.convert_tokens_to_ids(tokens)
        x.extend(xx)

    x = torch.LongTensor(x).unsqueeze(dim=0)

    # Process query 
예제 #12
0
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")
        fout.write(f"accuracy={accuracy}\n")

    os.remove(f)

    print("precision=%.2f" % precision)
    print("recall=%.2f" % recall)
    print("f1=%.2f" % f1)
    print("accuracy=%.2f" % accuracy)
    return precision, recall, f1


if __name__ == "__main__":

    hp = HParams('i2b2')

    train_on_gpu = torch.cuda.is_available()
    hp = HParams('relations')
    relations_train_dataset = RelationDataset(
        "Data/formatted/relationsTrainFinal.tsv", 'relations')
    relations_eval_dataset = RelationDataset(
        "Data/formatted/relationsTestFinal.tsv", 'relations')

    # Define model
    config = BertConfig(
        vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)

    model = RelNet(config=config,
                   bert_state_dict=state_dict,
                   vocab_len=len(hp.VOCAB),