예제 #1
0
def get_prediction(model, sentence, save_dir, mark='Eval', verbose=False):
    """Predict the intent class and slot labels for one encoded sentence.

    Args:
        model: trained Transformer_Mix model already moved to `device`.
        sentence: LongTensor of token indices, shape (1, max_len), 0-padded.
        save_dir: directory containing idx2lbl.json / idx2cls.json and mask files.
        mark: log tag (unused in the computation, kept for interface compatibility).
        verbose: verbosity flag (unused in the computation).

    Returns:
        (pred_cls, pred_lbls): predicted intent string and the list of slot
        labels for the non-padded positions.
    """
    # set model to evaluation mode
    model.eval()

    idx2lbl = load_obj(save_dir + "idx2lbl.json")
    idx2cls = load_obj(save_dir + "idx2cls.json")

    enc = sentence.to(device)
    enc_self_attn_mask = get_attn_pad_mask(enc, enc)
    # FIX: Tensor.to() is not in-place — the original discarded the result.
    enc_self_attn_mask = enc_self_attn_mask.to(device)

    # get results from model
    logits_tgt, logits_clsf = model(enc, enc_self_attn_mask)

    # number of PAD (=0) tokens — used to trim predictions to the true length
    pad_num = int(enc.data.eq(0).sum(axis=1)[0])

    score_cls, cls_idx = torch.max(logits_clsf, dim=-1)
    pred_cls = cls_idx[0].data.tolist()

    # get valid slot for a specific intent
    idx_mask = load_mask(save_dir)

    masked_logits_tgt = softmax_mask(logits_tgt, cls_idx, idx_mask)
    score_tgt, tgt_idx = torch.max(masked_logits_tgt, dim=-1)

    # FIX: slice with an explicit end index — the original `0:-pad_num`
    # produced an EMPTY slice whenever the sentence had no padding (pad_num == 0).
    seq_end = tgt_idx.size(1) - pad_num
    pred_tags = tgt_idx[0, 0:seq_end].data.tolist()

    pred_lbls = [idx2lbl[str(idx)] for idx in pred_tags]
    pred_cls = idx2cls[str(pred_cls)]

    return pred_cls, pred_lbls
예제 #2
0
def save_dict_onnx(save_dir, save_onnx):
    """Copy vocabulary/label dictionaries and a minimal config into the ONNX export dir."""
    full_config = load_obj(save_dir + "Config.json")
    word2idx = load_obj(save_dir + "dict.json")
    idx2lbl = load_obj(save_dir + "idx2lbl.json")
    idx2cls = load_obj(save_dir + "idx2cls.json")

    # The ONNX runtime only needs the sequence length plus the special-token ids.
    slim_config = {
        'max_len': full_config['max_len'],
        'WORD': WORD,
        'BOS': BOS,
        'UNK': UNK,
        'PAD': PAD,
    }

    save_obj(word2idx, save_onnx + 'dict.json')
    save_obj(slim_config, save_onnx + 'Config.json')
    save_obj(idx2lbl, save_onnx + "idx2lbl.json")
    save_obj(idx2cls, save_onnx + "idx2cls.json")
예제 #3
0
def load_save_mask(save_dir, save_onnx):
    """Build per-intent valid-slot masks and persist them.

    Writes a plain-list version for ONNX (`idx_mask_onnx.json` in save_onnx)
    and a LongTensor version for the PyTorch runtime (`idx_mask.json` in save_dir).
    """
    config = load_obj(save_dir + "Config.json")
    num_label = config['num_label']
    dict_lbl = load_obj(save_dir + "dict_lbl.json")
    dict_clsf = load_obj(save_dir + "dict_clsf.json")
    lbl_mask = load_obj(save_dir + "lbl_mask.json")

    idx_mask = {}
    idx_mask_onnx = {}
    for intent, valid_slot in lbl_mask.items():
        # 1 marks a slot label that is legal under this intent, 0 otherwise
        mask = [0] * num_label
        for slot in valid_slot:
            mask[dict_lbl[slot]] = 1
        intent_idx = dict_clsf[intent]
        idx_mask_onnx[intent_idx] = mask
        idx_mask[intent_idx] = torch.LongTensor(mask).to(device)

    save_obj(idx_mask_onnx, save_onnx + 'idx_mask_onnx.json')
    # NOTE(review): torch.save writes a pickle despite the .json extension;
    # kept as-is because the load side expects exactly this path/name.
    torch.save(idx_mask, save_dir + 'idx_mask.json')
예제 #4
0
def predict():
    """Flask view: run slot/intent prediction on the `text` query parameter.

    Loads the saved checkpoint and config, encodes the request sentence,
    runs `get_prediction`, and renders the result template.
    """
    parser = argparse.ArgumentParser(description='Transformer NER')
    parser.add_argument('--save-dir',
                        type=str,
                        default='./data_char/',
                        help='path to save processed data')
    parser.add_argument('--pre-w2v', type=str, default='../data/w2v')
    args = parser.parse_args()

    # pretrained word embeddings used to initialize the model
    embeddings = torch.Tensor(torch.load(args.save_dir + 'pre_w2v')).to(device)

    ckpt_path = os.path.join(args.save_dir,
                             '{}.pyt'.format("Transformer_NER_best"))
    checkpoint = torch.load(ckpt_path, map_location=torch.device(device))

    config = load_obj(args.save_dir + 'Config.json')
    model = Transformer_Mix(config, embeddings).to(device)
    model.load_state_dict(checkpoint['model'])
    model.eval()

    loader = DataLoader_test(args.save_dir)

    input_sentence = request.args.get('text')
    tokens, test_data = loader.load_sentences(input_sentence)
    pred_cls, pred_lbls = get_prediction(model,
                                         test_data,
                                         args.save_dir,
                                         mark='Test',
                                         verbose=True)
    slot = pretty_print(tokens, pred_lbls, pred_cls)
    return render_template('result.html',
                           input_sentence=input_sentence,
                           pred_cls=''.join(pred_cls),
                           pred_lbls=' '.join(pred_lbls),
                           slot=slot)
예제 #5
0
def load_mask(save_dir):
    """Load per-intent slot masks as LongTensors keyed by intent index."""
    config = load_obj(save_dir + "Config.json")
    num_label = config['num_label']
    # NOTE(review): idx2lbl / idx2cls are loaded but unused; the reads are
    # kept because load_obj touches the filesystem (behavior preserved).
    idx2lbl = load_obj(save_dir + "idx2lbl.json")
    idx2cls = load_obj(save_dir + "idx2cls.json")
    dict_lbl = load_obj(save_dir + "dict_lbl.json")
    dict_clsf = load_obj(save_dir + "dict_clsf.json")
    lbl_mask = load_obj(save_dir + "lbl_mask.json")

    idx_mask = {}
    for intent, valid_slot in lbl_mask.items():
        # 1 marks a slot label that is legal under this intent, 0 otherwise
        mask = [0] * num_label
        for slot in valid_slot:
            mask[dict_lbl[slot]] = 1
        idx_mask[dict_clsf[intent]] = torch.LongTensor(mask).to(device)
    return idx_mask
예제 #6
0
 def __init__(self, save_dir):
     """Remember the data directory and preload the vocabulary and config."""
     self.save_dir = save_dir
     self.word2idx = load_obj(save_dir + "dict.json")
     cfg = load_obj(save_dir + "Config.json")
     self.config = cfg
     # maximum encoded sentence length used when padding inputs
     self.max_len = cfg["max_len"]
예제 #7
0
def evaluate_f1_no_mask(model, dl_test, save_dir, criterion_clsf = nn.CrossEntropyLoss().to(device), criterion_tgt = nn.CrossEntropyLoss(ignore_index=PAD).to(device), verbose = False):
    """Evaluate slot-tagging and intent-classification F1 WITHOUT intent masking.

    Args:
        model: trained Transformer_Mix model.
        dl_test: iterable of (enc, tgt, cls) test batches.
        save_dir: directory holding idx2lbl.json.
        criterion_clsf: intent-classification loss.
        criterion_tgt: slot-tagging loss (PAD positions ignored).
        verbose: if True, print a per-label classification report.

    Returns:
        (mean_loss, intent_f1 * 100, slot_f1 * 100, merged_slot_f1)
    """
    loss_test = 0
    pred_tags = []
    true_tags = []
    pred_clss = []
    true_clss = []
    idx2lbl = load_obj(save_dir + 'idx2lbl.json')

    for enc, tgt, cls in dl_test:
        model.eval()
        with torch.no_grad():
            enc = enc.to(device)
            tgt = tgt.to(device)
            cls = cls.to(device)
            # FIX: Tensor.to() is not in-place — the original discarded the result.
            enc_self_attn_mask = get_attn_pad_mask(enc, enc).to(device)

            logits_tgt, logits_clsf = model(enc, enc_self_attn_mask)
            loss_tgt = criterion_tgt(logits_tgt.transpose(1, 2), tgt)  # slot-tagging loss
            loss_tgt = (loss_tgt.float()).mean()
            loss_clsf = criterion_clsf(logits_clsf, cls)  # intent-classification loss
            loss = loss_clsf + loss_tgt
            # FIX: accumulate as a python float instead of a tensor.
            loss_test += loss.item()

        # number of PAD (=0) tokens per example -> true sequence lengths
        pad_mask = enc.data.eq(0).sum(axis=1)

        score_tgt, tgt_idx = torch.max(logits_tgt, dim=-1)
        score_cls, cls_idx = torch.max(logits_clsf, dim=-1)

        for pre, true, pad_num in zip(tgt_idx, tgt, pad_mask):
            # FIX: explicit end index — `0:-pad_num` produced an EMPTY slice
            # for full-length sentences (pad_num == 0).
            end = pre.size(0) - int(pad_num)
            pred_tags += pre[0:end].data.tolist()
            true_tags += true[0:end].data.tolist()

        pred_clss += cls_idx.tolist()
        true_clss += cls.tolist()

        assert len(pred_tags) == len(true_tags)
        assert len(pred_clss) == len(true_clss)

    # sklearn signature is f1_score(y_true, y_pred); the original swapped the
    # arguments (harmless under micro averaging, fixed for consistency).
    f1_tgt = f1_score(true_tags, pred_tags, average='micro')
    f1_cls = f1_score(true_clss, pred_clss, average='micro')

    # map tag indices back to label strings for the merged-entity F1 / report
    true_lbls = [idx2lbl[str(t)] for t in true_tags]
    pred_lbls = [idx2lbl[str(p)] for p in pred_tags]

    f1_tgt_merged = f1_score_merged(true_lbls, pred_lbls)

    if verbose:
        report = classification_report(true_lbls, pred_lbls)
        print("============no_mask_slot================")
        print(report, flush=True)

    return loss_test / len(dl_test), f1_cls * 100, f1_tgt * 100, f1_tgt_merged
예제 #8
0
                        help='path to save processed data')

    parser.add_argument('--pre-w2v', type=str, default='../data/w2v')
    args = parser.parse_args()
    
    args.corpus_data = args.save_dir + args.corpus_name
    # corpus = Corpus(args.corpus_data, args.pre_w2v, args.save_dir)

    dl = DataLoader(args.save_dir, batch_size = 128)()
    dl_train, dl_test = train_test_split(dl, test_size=0.33)
    pre_w2v = torch.load(args.save_dir + 'pre_w2v')
    pre_w2v = torch.Tensor(pre_w2v).to(device)
    

    model_ckpt = torch.load(os.path.join(args.save_dir, '{}.pyt'.format("Transformer_NER_best")),map_location=torch.device(device))
    config = load_obj(args.save_dir+'Config.json')
    # cls_size = config['num_class']
    # tgt_size = config['num_label']
    model =Transformer_Mix(config, pre_w2v).to(device)
    model.load_state_dict (model_ckpt['model'])
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    optimizer.load_state_dict(model_ckpt['model_opt'])

    loss_epoch_test = 0
    pred_tags = []
    true_tags = []

    criterion_clsf = nn.CrossEntropyLoss().to(device)
    criterion_tgt = nn.CrossEntropyLoss(ignore_index=PAD).to(device)

    loss_epoch_test, f1_cls, f1_tgt, f1_tgt_merged = evaluate_f1(model, dl_test, args.save_dir, verbose=1)
예제 #9
0
def _format_slots(entities, tokens):
    """Render (tag, start, end) entity chunks as '<tag>: token-span' strings."""
    slots = []
    for tag, start, end in entities:
        tok = ''.join(tokens[start:end + 1])
        slots.append('<{0}>: {1}'.format(tag, tok))
    return slots


def _write_lines(path, lines):
    """Write each line followed by a newline, UTF-8 encoded."""
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write("{0}".format(line + '\n'))


def generate_report_txt(
        model,
        dl_test,
        save_dir,
        criterion_clsf=nn.CrossEntropyLoss().to(device),
        criterion_tgt=nn.CrossEntropyLoss(ignore_index=PAD).to(device),
        verbose=False):
    """Run the model over the test set and write per-sentence error reports.

    Produces reports_correct.txt / reports_intent_error.txt /
    reports_slot_error.txt / scores.txt under `<save_dir>/reports/`.

    Args:
        model: trained Transformer_Mix model.
        dl_test: iterable of (enc, tgt, cls) test batches.
        save_dir: data directory (also the report output root).
        criterion_clsf: intent-classification loss.
        criterion_tgt: slot-tagging loss (PAD positions ignored).
        verbose: unused, kept for interface compatibility.
    """
    loss_test = 0
    pred_tags = []
    true_tags = []
    pred_clss = []
    true_clss = []

    idx2lbl = load_obj(save_dir + 'idx2lbl.json')
    idx2cls = load_obj(save_dir + "idx2cls.json")
    sents = load_obj(save_dir + "TestDataSentence.txt")

    # FIX: hoisted out of the batch loop — the original re-read five JSON
    # files from disk on every single batch.
    idx_mask = load_mask(save_dir)

    for enc, tgt, cls in dl_test:
        model.eval()
        with torch.no_grad():
            enc = enc.to(device)
            tgt = tgt.to(device)
            cls = cls.to(device)
            # FIX: Tensor.to() is not in-place — the original discarded the result.
            enc_self_attn_mask = get_attn_pad_mask(enc, enc).to(device)

            logits_tgt, logits_clsf = model(enc, enc_self_attn_mask)
            loss_tgt = criterion_tgt(logits_tgt.transpose(1, 2),
                                     tgt)  # slot-tagging loss
            loss_tgt = (loss_tgt.float()).mean()
            loss_clsf = criterion_clsf(logits_clsf,
                                       cls)  # intent-classification loss
            loss = loss_clsf + loss_tgt
            # FIX: accumulate as a python float instead of a tensor.
            loss_test += loss.item()

        # number of PAD (=0) tokens per example -> true sequence lengths
        pad_mask = enc.data.eq(0).sum(axis=1)

        score_cls, cls_idx = torch.max(logits_clsf, dim=-1)
        # restrict slot logits to those valid for the predicted intent
        masked_logits_tgt = softmax_mask(logits_tgt, cls_idx, idx_mask)
        score_tgt, tgt_idx = torch.max(masked_logits_tgt, dim=-1)

        for pre, true, pad_num in zip(tgt_idx, tgt, pad_mask):
            # FIX: explicit end index — `0:-pad_num` produced an EMPTY slice
            # for full-length sentences (pad_num == 0).
            end = pre.size(0) - int(pad_num)
            pred_tags.append(pre[0:end].data.tolist())
            true_tags.append(true[0:end].data.tolist())

        pred_clss += cls_idx.tolist()
        true_clss += cls.tolist()

    print("Prediction completed", flush=True)

    lines_correct = []
    lines_intent_error = []
    lines_slot_error = []
    for idx in range(len(true_clss)):
        tokens = sents[idx].split(' ')
        true_lbls = [idx2lbl[str(t)] for t in true_tags[idx]]
        pred_lbls = [idx2lbl[str(p)] for p in pred_tags[idx]]

        # FIX: the original zipped true/pred entities together with the whole
        # true_clss/pred_clss lists (whose values it never used), truncating
        # both entity lists to the shortest — which could report mismatched
        # predictions as correct. Format each entity list in full instead.
        slots_true = _format_slots(get_entities(true_lbls), tokens)
        slots_pred = _format_slots(get_entities(pred_lbls), tokens)

        intent_true = idx2cls[str(true_clss[idx])]
        intent_pred = idx2cls[str(pred_clss[idx])]

        line = "Sentence:{0:}\nExpect: \t{1}\t{2}\nPredict:\t{3}\t{4}\n".format(
            sents[idx], intent_true, slots_true, intent_pred, slots_pred)
        if intent_true != intent_pred:
            lines_intent_error.append(line)
        elif slots_true != slots_pred:
            lines_slot_error.append(line)
        else:
            lines_correct.append(line)

    correct_num = len(lines_correct)
    intent_w_num = len(lines_intent_error)
    slot_w_num = len(lines_slot_error)
    total_line = correct_num + intent_w_num + slot_w_num

    score1 = 'total line = {0}; Exact match = {1}, with intent fail = {2}, with slot fail = {3};'.format(
        total_line, correct_num, intent_w_num, slot_w_num)
    score2 = 'Accuracy = {0:.4f}'.format(correct_num / total_line)
    scores = [score1, score2]

    # saving report files
    print("Saving reports...", flush=True)
    report_dir = os.path.join(save_dir, 'reports', '')
    create_dir(report_dir)

    remove_old_file(report_dir + 'reports_correct.txt')
    remove_old_file(report_dir + 'reports_intent_error.txt')
    remove_old_file(report_dir + 'reports_slot_error.txt')
    remove_old_file(report_dir + 'scores.txt')

    _write_lines(report_dir + 'reports_correct.txt', lines_correct)
    _write_lines(report_dir + 'reports_intent_error.txt', lines_intent_error)
    _write_lines(report_dir + 'reports_slot_error.txt', lines_slot_error)
    _write_lines(report_dir + 'scores.txt', scores)