예제 #1
0
def decode(pred_path):
    """Score a prediction file against the validation slot and intent labels.

    Predictions are loaded from ``pred_path`` through
    ``data_reader.read_seqtag_data`` and compared, example by example, with
    the module-level ``valid_slot_tags`` and ``valid_intent_tags``.

    Args:
        pred_path: Path handed to ``data_reader.read_seqtag_data``.

    Returns:
        Two ``(precision, recall, f1)`` triples in percent: the first for
        slot chunks, the second for intent labels.  A triple is
        ``(0, 0, 0)`` when its true-positive count is zero.
    """

    def prf(tp, fp, fn):
        # Percent precision / recall / F1; all zero without true positives.
        if tp == 0:
            return 0, 0, 0
        return (100 * tp / (tp + fp), 100 * tp / (tp + fn),
                100 * 2 * tp / (2 * tp + fn + fp))

    slot_preds, intent_preds = data_reader.read_seqtag_data(
        pred_path, slot_tag_to_idx, intent_tag_to_idx)

    slot_tp, slot_fp, slot_fn = 0.0, 0.0, 0.0
    intent_tp, intent_fp, intent_fn = 0.0, 0.0, 0.0

    # Slots: compare the chunks extracted from predicted and gold tag strings.
    for row, predicted in enumerate(slot_preds['data']):
        hyp_tags = [idx_to_slot_tag[tag] for tag in predicted]
        ref_tags = [
            idx_to_slot_tag[tag] for tag in valid_slot_tags['data'][row]
        ]

        # Both sequences are padded with 'O' on each side before chunking.
        hyp_chunks = acc.get_chunks(['O'] + hyp_tags + ['O'])
        ref_chunks = acc.get_chunks(['O'] + ref_tags + ['O'])
        for chunk in hyp_chunks:
            if chunk not in ref_chunks:
                slot_fp += 1
            else:
                slot_tp += 1
        for chunk in ref_chunks:
            if chunk not in hyp_chunks:
                slot_fn += 1

    # Intents: compare one-hot membership vectors over all intent labels.
    for row, predicted in enumerate(intent_preds['data']):
        hyp_vec = [0] * len(intent_tag_to_idx)
        ref_vec = [0] * len(intent_tag_to_idx)

        for label in predicted:
            hyp_vec[label] = 1
        for label in valid_intent_tags['data'][row]:
            ref_vec[label] = 1

        for hyp_bit, ref_bit in zip(hyp_vec, ref_vec):
            if hyp_bit == 1 and ref_bit == 1:
                intent_tp += 1
            elif hyp_bit == 1:
                intent_fp += 1
            elif ref_bit == 1:
                intent_fn += 1

    return (prf(slot_tp, slot_fp, slot_fn),
            prf(intent_tp, intent_fp, intent_fn))
def decode(data_feats, data_tags, data_class, output_path):
    """Tag a dataset batch by batch, dump the predictions and score them.

    Every example is written to ``output_path`` as
    ``word:gold_tag:pred_tag ... <=> gold_classes <=> pred_classes``; when
    ``opt.testing`` is set the line is prefixed with ``<line_num> : ``.
    The class fields are empty strings when ``opt.task_sc`` is off.

    Args:
        data_feats: Input features, forwarded to the minibatch reader.
        data_tags: Gold tag sequences, forwarded to the minibatch reader.
        data_class: Gold classes, forwarded to the minibatch reader.
        output_path: File that receives one line per example.

    Returns:
        ``(mean_losses, p, r, f, class_f)``: the mean over batches of
        ``[tag_loss, class_loss]``, the chunk precision / recall / F1 in
        percent, and the class F1 in percent (0 when nothing was counted).
    """
    data_index = np.arange(len(data_feats))
    batch_losses = []
    slot_tp, slot_fp, slot_fn = 0.0, 0.0, 0.0
    cls_tp, cls_fp, cls_fn = 0.0, 0.0, 0.0
    with open(output_path, 'w') as out_file:
        for start in range(0, len(data_index), opt.test_batchSize):
            batch = data_reader.get_minibatch_with_class(
                data_feats,
                data_tags,
                data_class,
                tag_to_idx,
                class_to_idx,
                data_index,
                start,
                opt.test_batchSize,
                add_start_end=opt.bos_eos,
                multiClass=opt.multiClass,
                keep_order=opt.testing,
                enc_dec_focus=opt.enc_dec,
                device=opt.device)
            # In testing mode the batch carries one extra field: line numbers.
            if opt.testing:
                (words, tags, raw_tags, classes, raw_classes, lens,
                 line_nums) = batch
            else:
                words, tags, raw_tags, classes, raw_classes, lens = batch

            uses_xlnet = opt.pretrained_model_type in ['xlnet']
            inputs = prepare_inputs_for_bert_xlnet(
                words,
                lens,
                tokenizer,
                cls_token_at_end=uses_xlnet,  # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                sep_token=tokenizer.sep_token,
                cls_token_segment_id=2 if uses_xlnet else 0,
                pad_on_left=uses_xlnet,  # pad on the left for xlnet
                pad_token_segment_id=4 if uses_xlnet else 0,
                device=opt.device)

            if opt.enc_dec:
                # Greedy decoding is forced on here, so the beam-search branch
                # below only runs if this assignment is edited.
                opt.greed_decoding = True
                if not opt.greed_decoding:
                    beam_size = 2
                    beam_scores_1best, top_path_slots, encoder_info = model_tag.decode_beam_search(
                        inputs,
                        lens,
                        beam_size,
                        tag_to_idx,
                        with_snt_classifier=True)
                    top_pred_slots = [[step[0].item() for step in path]
                                      for path in top_path_slots]
                    ppl = beam_scores_1best.cpu() / torch.tensor(
                        lens, dtype=torch.float)
                    tag_loss = ppl.exp().sum()
                else:
                    tag_scores_1best, outputs_1best, encoder_info = model_tag.decode_greed(
                        inputs, tags[:, 0:1], lens, with_snt_classifier=True)
                    flat_scores = tag_scores_1best.contiguous().view(
                        -1, len(tag_to_idx))
                    flat_gold = tags[:, 1:].contiguous().view(-1)
                    tag_loss = tag_loss_function(flat_scores, flat_gold)
                    top_pred_slots = outputs_1best.cpu().numpy()
            elif opt.crf:
                # 1 for real positions, 0 for padding up to the longest item.
                longest = max(lens)
                mask_rows = [[1] * n + [0] * (longest - n) for n in lens]
                masks = torch.tensor(mask_rows,
                                     dtype=torch.uint8,
                                     device=opt.device)
                crf_feats, encoder_info = model_tag._get_lstm_features(
                    inputs, lens, with_snt_classifier=True)
                _path_scores, tag_path = model_tag.forward(crf_feats, masks)
                tag_loss = model_tag.neg_log_likelihood(crf_feats, masks, tags)
                top_pred_slots = tag_path.data.cpu().numpy()
            else:
                tag_scores, encoder_info = model_tag(inputs,
                                                     lens,
                                                     with_snt_classifier=True)
                flat_scores = tag_scores.contiguous().view(-1, len(tag_to_idx))
                tag_loss = tag_loss_function(flat_scores, tags.view(-1))
                top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1)

            # Tag loss is averaged per token, class loss per example.
            tag_loss_value = tag_loss.item() / sum(lens)
            if opt.task_sc:
                class_scores = model_class(encoder_info_filter(encoder_info))
                class_loss = class_loss_function(class_scores, classes)
                snt_probs = class_scores.data.cpu().numpy()
                if not opt.multiClass:
                    snt_probs = snt_probs.argmax(axis=-1)
                batch_losses.append(
                    [tag_loss_value, class_loss.item() / len(lens)])
            else:
                batch_losses.append([tag_loss_value, 0])

            for idx, pred_line in enumerate(top_pred_slots):
                length = lens[idx]
                pred_seq = [idx_to_tag[tag] for tag in pred_line][:length]
                # Gold tags may arrive as indices or already as strings.
                lab_seq = [
                    idx_to_tag[tag] if type(tag) is int else tag
                    for tag in raw_tags[idx]
                ]
                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for chunk in pred_chunks:
                    if chunk not in label_chunks:
                        slot_fp += 1
                    else:
                        slot_tp += 1
                for chunk in label_chunks:
                    if chunk not in pred_chunks:
                        slot_fn += 1

                input_line = words[idx]
                word_tag_line = [
                    input_line[pos] + ':' + lab_seq[pos] + ':' + pred_seq[pos]
                    for pos in range(len(input_line))
                ]

                gold_class_str = ''
                pred_class_str = ''
                if opt.task_sc and opt.multiClass:
                    # Multi-label: every class scoring above 0.5 is predicted.
                    pred_classes = [
                        idx_to_class[i]
                        for i, p in enumerate(snt_probs[idx]) if p > 0.5
                    ]
                    gold_classes = [idx_to_class[i] for i in raw_classes[idx]]
                    for name in pred_classes:
                        if name in gold_classes:
                            cls_tp += 1
                        else:
                            cls_fp += 1
                    for name in gold_classes:
                        if name not in pred_classes:
                            cls_fn += 1
                    gold_class_str = ';'.join(gold_classes)
                    pred_class_str = ';'.join(pred_classes)
                elif opt.task_sc:
                    pred_class = idx_to_class[snt_probs[idx]]
                    if type(raw_classes[idx]) is int:
                        gold_classes = {idx_to_class[raw_classes[idx]]}
                    else:
                        gold_classes = set(raw_classes[idx])
                    if pred_class in gold_classes:
                        cls_tp += 1
                    else:
                        # A wrong single prediction is both a miss and a
                        # spurious label.
                        cls_fp += 1
                        cls_fn += 1
                    gold_class_str = ';'.join(gold_classes)
                    pred_class_str = pred_class

                record = (' '.join(word_tag_line) + ' <=> ' + gold_class_str +
                          ' <=> ' + pred_class_str + '\n')
                if opt.testing:
                    out_file.write(str(line_nums[idx]) + ' : ' + record)
                else:
                    out_file.write(record)

    if slot_tp == 0:
        p, r, f = 0, 0, 0
    else:
        p = 100 * slot_tp / (slot_tp + slot_fp)
        r = 100 * slot_tp / (slot_tp + slot_fn)
        f = 100 * 2 * slot_tp / (2 * slot_tp + slot_fn + slot_fp)

    class_denominator = 2 * cls_tp + cls_fn + cls_fp
    class_f = 0 if class_denominator == 0 else 100 * 2 * cls_tp / (
        2 * cls_tp + cls_fn + cls_fp)

    mean_losses = np.mean(batch_losses, axis=0)
    return mean_losses, p, r, f, class_f
예제 #3
0
    if item[0] == 'B':
        B_type_list.append(item[2:])
    if item[0] == 'I':
        I_type_list.append(item[2:])
# Build (slot_type, slot_length) pairs for every type collected in
# B_type_list: slot_length is 3 when the type also appears in I_type_list,
# otherwise 1.
type_list = []
for item in B_type_list:
    if item in I_type_list:
        type_list.append((item, 3))
    else:
        type_list.append((item, 1))

# Write the slot submission file: a header row followed by one "<id>,<0|1>"
# row per candidate chunk, with ids counting up from 1 across all sentences.
out_slot_path = os.path.join(result_root, 'submission_slot.csv')
line_id = 1
with open(out_slot_path, 'w') as f:
    f.write('Id,Expected\n')
    for line in test_slot_tags['data']:
        # Enumerate candidate (start, end, type) chunks for this sentence;
        # positions start at 1.
        all_chunks = []
        sentence_length = len(line)
        for Type, slot_length in type_list:
            for i in range(1, sentence_length + 1):
                # NOTE(review): with max() the end index always runs at least
                # to sentence_length and can run past it near the end of the
                # sentence; min() may have been intended to cap the span at
                # slot_length. Changing it alters the number and order of
                # rows, so confirm against the expected submission format
                # before touching it.
                for j in range(i, max(i + slot_length, sentence_length + 1)):
                    all_chunks.append((i, j, Type))
        # Chunks actually present in the tag sequence, computed on the
        # sequence padded with 'O' on both sides.
        lab_seq = [idx_to_slot_tag[slot] for slot in line]
        label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
        # Emit 1 for candidates found among the labelled chunks, else 0.
        for k in range(len(all_chunks)):
            if all_chunks[k] in label_chunks:
                f.write(str(line_id) + ',1\n')
            else:
                f.write(str(line_id) + ',0\n')
            line_id += 1
예제 #4
0
def decode(data_feats, data_slot_tags, output_path):
    """Tag a dataset batch by batch, dump ``word:tag`` lines and score them.

    Every example is written to ``output_path`` as space-separated
    ``word:pred_tag`` pairs; when ``opt.testing`` is set the raw words from
    the reader are used and the line is prefixed with ``<line_num> : ``.

    Args:
        data_feats: Input features, forwarded to the minibatch reader.
        data_slot_tags: Gold slot tags, forwarded to the minibatch reader.
        output_path: File that receives one line per example.

    Returns:
        ``(mean_loss, p, r, f)``: the mean over batches of the per-token
        slot-tag loss and the chunk precision / recall / F1 in percent
        (all three are 0 when there are no true positives).
    """
    data_index = np.arange(len(data_feats))
    batch_losses = []
    n_hit, n_spurious, n_missed = 0.0, 0.0, 0.0
    with open(output_path, 'w') as out_file:
        for start in range(0, len(data_index), opt.test_batchSize):
            batch = data_reader.get_minibatch_with_unali_act(
                data_feats,
                data_slot_tags,
                word_to_idx,
                slot_tag_to_idx,
                data_index,
                start,
                opt.test_batchSize,
                add_start_end=opt.bos_eos,
                keep_order=opt.testing,
                raw_word=bool(opt.testing),
                enc_dec_focus=opt.enc_dec,
                device=opt.device)
            # With raw words requested the batch also carries line numbers
            # and the raw word sequences.
            if opt.testing:
                inputs, slot_tags, lens, line_nums, raw_words = batch
            else:
                inputs, slot_tags, lens = batch

            # slot tag
            if opt.enc_dec:
                # Greedy decoding is forced on here, so the beam-search branch
                # below only runs if this assignment is edited.
                opt.greed_decoding = True
                if not opt.greed_decoding:
                    beam_size = 2
                    beam_tag_scores_1best, pred_slot_tag_1best, _ = model_tag.decode_beam_search(
                        inputs, lens, beam_size, slot_tag_to_idx)
                    ppl = beam_tag_scores_1best.cpu() / torch.tensor(
                        lens, dtype=torch.float)
                    slot_tag_loss = ppl.exp().sum()
                    pred_slot_tag_1best = [[step[0].item() for step in path]
                                           for path in pred_slot_tag_1best]
                else:
                    slot_tag_scores_1best, pred_slot_tag_1best, h_t_c_t = model_tag.decode_greed(
                        inputs, slot_tags[:, 0:1], lens)
                    flat_scores = slot_tag_scores_1best.contiguous().view(
                        -1, len(slot_tag_to_idx))
                    flat_gold = slot_tags[:, 1:].contiguous().view(-1)
                    slot_tag_loss = slot_tag_loss_function(
                        flat_scores, flat_gold)
                    pred_slot_tag_1best = pred_slot_tag_1best.cpu().numpy()
                # The first gold column was fed to the decoder above; drop it
                # before comparing with the predictions.
                slot_tags = slot_tags[:, 1:].data.cpu().numpy()
            elif opt.crf:
                # 1 for real positions, 0 for padding up to the longest item.
                longest = max(lens)
                mask_rows = [[1] * n + [0] * (longest - n) for n in lens]
                masks = torch.tensor(mask_rows,
                                     dtype=torch.uint8,
                                     device=opt.device)
                crf_feats, h_t_c_t = model_tag._get_lstm_features(inputs, lens)
                _path_scores, slot_tag_path = model_tag.forward(
                    crf_feats, masks)
                slot_tag_loss = model_tag.neg_log_likelihood(
                    crf_feats, masks, slot_tags)
                pred_slot_tag_1best = slot_tag_path.data.cpu().numpy()
                slot_tags = slot_tags.data.cpu().numpy()
            else:
                slot_tag_scores, h_t_c_t = model_tag(inputs, lens)
                flat_scores = slot_tag_scores.contiguous().view(
                    -1, len(slot_tag_to_idx))
                slot_tag_loss = slot_tag_loss_function(flat_scores,
                                                       slot_tags.view(-1))
                pred_slot_tag_1best = slot_tag_scores.data.cpu().numpy(
                ).argmax(axis=-1)
                slot_tags = slot_tags.data.cpu().numpy()

            batch_losses.append(slot_tag_loss.item() / sum(lens))

            inputs = inputs.data.cpu().numpy()
            for idx, pred_line in enumerate(pred_slot_tag_1best):
                length = lens[idx]
                # slot tag
                pred_seq = [idx_to_slot_tag[t] for t in pred_line[:length]]
                lab_seq = [
                    idx_to_slot_tag[t] for t in slot_tags[idx][:length]
                ]
                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for chunk in pred_chunks:
                    if chunk not in label_chunks:
                        n_spurious += 1
                    else:
                        n_hit += 1
                for chunk in label_chunks:
                    if chunk not in pred_chunks:
                        n_missed += 1

                if opt.testing:
                    input_line = raw_words[idx]
                    prefix = str(line_nums[idx]) + ' : '
                else:
                    input_line = [idx_to_word[word]
                                  for word in inputs[idx]][:length]
                    prefix = ''
                word_tag_line = [
                    input_line[pos] + ':' + pred_seq[pos]
                    for pos in range(len(input_line))
                ]
                out_file.write(prefix + ' '.join(word_tag_line) + '\n')

    if n_hit == 0:
        p, r, f = 0, 0, 0
    else:
        p = 100 * n_hit / (n_hit + n_spurious)
        r = 100 * n_hit / (n_hit + n_missed)
        f = 100 * 2 * n_hit / (2 * n_hit + n_missed + n_spurious)

    mean_losses = np.mean(batch_losses, axis=0)
    return mean_losses, p, r, f
def decode(sen_feats, data_feats, data_tags, data_class, output_path):
    """Tag a dataset batch by batch, dump the predictions and score them.

    Every example is written to ``output_path`` as
    ``word:gold_tag:pred_tag ... <=> gold_classes <=> pred_classes``; when
    ``opt.testing`` is set the line is prefixed with ``<line_num> : ``.
    The class fields are empty strings when ``opt.task_sc`` is off.

    Args:
        sen_feats: Sentence-level features, batched with
            ``data_reader.get_sen_minibatch`` and fed to the CRF branch.
        data_feats: Token-level input features.
        data_tags: Gold tag sequences.
        data_class: Gold classes.
        output_path: File that receives one line per example.

    Returns:
        ``(mean_losses, p, r, f, cp, cr, cf)``: the mean over batches of
        ``[tag_loss, class_loss]``, the chunk precision / recall / F1 in
        percent, and the class precision / recall / F1 in percent.  Each
        triple is ``(0, 0, 0)`` when its true-positive count is zero.
    """
    data_index = np.arange(len(data_feats))
    losses = []
    TP, FP, FN = 0.0, 0.0, 0.0
    TP2, FP2, FN2 = 0.0, 0.0, 0.0
    with open(output_path, 'w') as out_file:
        for j in range(0, len(data_index), opt.test_batchSize):
            batch = data_reader.get_minibatch_with_class(
                data_feats,
                data_tags,
                data_class,
                word_to_idx,
                tag_to_idx,
                class_to_idx,
                data_index,
                j,
                opt.test_batchSize,
                add_start_end=False,
                multiClass=opt.multiClass,
                keep_order=opt.testing,
                enc_dec_focus=False,
                device=opt.device)
            # In testing mode the batch carries one extra field: line numbers.
            if opt.testing:
                (inputs, tags, raw_tags, classes, raw_classes, lens,
                 line_nums) = batch
            else:
                inputs, tags, raw_tags, classes, raw_classes, lens = batch

            # Fetch the sentence features with this function's own index and
            # the same batch size the loop strides by, so they line up with
            # the token batch above.  The previous code used the global
            # training index in testing mode and the training batch size in
            # both modes, which selects different examples.
            input_sens = data_reader.get_sen_minibatch(sen_feats,
                                                       data_index,
                                                       j,
                                                       opt.test_batchSize,
                                                       device=opt.device)

            if opt.crf:
                # 1 for real positions, 0 for padding up to the longest item.
                max_len = max(lens)
                masks = [[1] * n + [0] * (max_len - n) for n in lens]
                masks = torch.tensor(masks,
                                     dtype=torch.uint8,
                                     device=opt.device)

                crf_feats, encoder_info = model_tag._get_lstm_features(
                    input_sens, lens, with_snt_classifier=True)
                _path_scores, tag_path = model_tag.forward(crf_feats, masks)
                tag_loss = model_tag.neg_log_likelihood(crf_feats, masks, tags)
                top_pred_slots = tag_path.data.cpu().numpy()
            else:
                # NOTE(review): this branch feeds `inputs`, not `input_sens`,
                # to the tagger - confirm that is intended.
                tag_scores, encoder_info = model_tag(inputs,
                                                     lens,
                                                     with_snt_classifier=True)
                tag_loss = tag_loss_function(
                    tag_scores.contiguous().view(-1, len(tag_to_idx)),
                    tags.view(-1))
                top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1)

            # Tag loss is averaged per token, class loss per example.
            if opt.task_sc:
                class_scores = model_class(encoder_info_filter(encoder_info))
                class_loss = class_loss_function(class_scores, classes)
                if opt.multiClass:
                    snt_probs = class_scores.data.cpu().numpy()
                else:
                    snt_probs = class_scores.data.cpu().numpy().argmax(axis=-1)
                losses.append([
                    tag_loss.item() / sum(lens),
                    class_loss.item() / len(lens)
                ])
            else:
                losses.append([tag_loss.item() / sum(lens), 0])

            inputs = inputs.data.cpu().numpy()
            for idx, pred_line in enumerate(top_pred_slots):
                length = lens[idx]
                pred_seq = [idx_to_tag[tag] for tag in pred_line][:length]
                # Gold tags may arrive as indices or already as strings.
                lab_seq = [
                    idx_to_tag[tag] if isinstance(tag, int) else tag
                    for tag in raw_tags[idx]
                ]
                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for pred_chunk in pred_chunks:
                    if pred_chunk in label_chunks:
                        TP += 1
                    else:
                        FP += 1
                for label_chunk in label_chunks:
                    if label_chunk not in pred_chunks:
                        FN += 1

                input_line = [idx_to_word[word]
                              for word in inputs[idx]][:length]
                word_tag_line = [
                    input_line[_idx] + ':' + lab_seq[_idx] + ':' +
                    pred_seq[_idx] for _idx in range(len(input_line))
                ]

                if opt.task_sc:
                    if opt.multiClass:
                        # Multi-label: every class scoring above 0.5 counts
                        # as predicted.
                        pred_classes = [
                            idx_to_class[i]
                            for i, p in enumerate(snt_probs[idx]) if p > 0.5
                        ]
                        gold_classes = [
                            idx_to_class[i] for i in raw_classes[idx]
                        ]
                        for pred_class in pred_classes:
                            if pred_class in gold_classes:
                                TP2 += 1
                            else:
                                FP2 += 1
                        for gold_class in gold_classes:
                            if gold_class not in pred_classes:
                                FN2 += 1
                        gold_class_str = ';'.join(gold_classes)
                        pred_class_str = ';'.join(pred_classes)
                    else:
                        pred_class = idx_to_class[snt_probs[idx]]
                        if isinstance(raw_classes[idx], int):
                            gold_classes = {idx_to_class[raw_classes[idx]]}
                        else:
                            gold_classes = set(raw_classes[idx])
                        if pred_class in gold_classes:
                            TP2 += 1
                        else:
                            # A wrong single prediction is both a miss and a
                            # spurious label.
                            FP2 += 1
                            FN2 += 1
                        gold_class_str = ';'.join(gold_classes)
                        pred_class_str = pred_class
                else:
                    gold_class_str = ''
                    pred_class_str = ''

                if opt.testing:
                    out_file.write(
                        str(line_nums[idx]) + ' : ' + ' '.join(word_tag_line) +
                        ' <=> ' + gold_class_str + ' <=> ' + pred_class_str +
                        '\n')
                else:
                    out_file.write(' '.join(word_tag_line) + ' <=> ' +
                                   gold_class_str + ' <=> ' + pred_class_str +
                                   '\n')

    if TP == 0:
        p, r, f = 0, 0, 0
    else:
        p, r, f = 100 * TP / (TP + FP), 100 * TP / (TP + FN), 100 * 2 * TP / (
            2 * TP + FN + FP)

    if TP2 == 0:
        cp, cr, cf = 0, 0, 0
    else:
        cp, cr, cf = 100 * TP2 / (TP2 + FP2), 100 * TP2 / (
            TP2 + FN2), 100 * 2 * TP2 / (2 * TP2 + FN2 + FP2)

    mean_losses = np.mean(losses, axis=0)
    return mean_losses, p, r, f, cp, cr, cf