Example #1
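# Imports assumed from the LM-LSTM-CRF repo layout (module paths are an assumption):
import itertools

from model.crf import CRFDecode_vb
from model.evaluator import eval_batch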
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args:
        packer: provides the method to convert targets into the original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: report F1 if the string contains 'f', otherwise report accuracy

    """
    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader, file_no):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for the test set
            file_no: corpus index forwarded to the model
        """
        ner_model.eval()
        self.reset()

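        # dataset_loader is an iterable of loaders; chain.from_iterable flattens it into one batch stream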
        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(
                dataset_loader):
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f, file_no)
            decoded = self.decoder.decode(scores.data, mask_v.data)

            self.eval_b(decoded, tg)

        return self.calc_s()
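
A minimal usage sketch for eval_wc. Everything here is hypothetical wiring: packer, l_map, ner_model, and test_loader would come from the surrounding LM-LSTM-CRF training script, and the 4-tuple return follows the companion eval_batch.f1_score, which is not shown in this listing.

evaluator = eval_wc(packer, l_map, score_type='f')

# With 'f' in score_type, calc_s is f1_score; the training scripts unpack
# F1, precision, recall, and token accuracy from it.
f1, pre, rec, acc = evaluator.calc_score(ner_model, test_loader, file_no=0)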
Example #2
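# Imports assumed from the LM-LSTM-CRF repo layout (module paths are an assumption):
import itertools

from model.crf import CRFDecode_vb
from model.evaluator import eval_batch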
class eval_w(eval_batch):
    """evaluation class for word level model (LSTM-CRF)

    args: 
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """
    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LSTM-CRF model
            dataset_loader: loader class for the test set
        """
        ner_model.eval()
        self.reset()

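        # dataset_loader is an iterable of loaders; chain.from_iterable flattens it into one batch stream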
        for feature, tg, mask in itertools.chain.from_iterable(dataset_loader):
            fea_v, _, mask_v = self.packer.repack_vb(feature, tg, mask)
            scores, _ = ner_model(fea_v)
            decoded = self.decoder.decode(scores.data, mask_v.data)

            self.eval_b(decoded, tg)

        return self.calc_s()
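
The word-level evaluator is driven the same way, minus the corpus index. Again a sketch: the objects and the single-float accuracy return (via the companion acc_score) are assumptions about code not shown here.

evaluator = eval_w(packer, l_map, score_type='a')  # no 'f' -> accuracy metric
acc = evaluator.calc_score(ner_model, test_loader)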
Example #3
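# Imports assumed from the LM-LSTM-CRF repo layout (module paths are an assumption):
import itertools

import torch
import torch.autograd as autograd

from model.crf import CRFDecode_vb
from model.predictor import predict
from model.utils import concatChar, encode2char_safe, encode_safe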
class predict_wc(predict):
    """prediction class for LM-LSTM-CRF

    args:
        if_cuda: whether to use CUDA to speed up decoding
        f_map: dictionary for words
        c_map: dictionary for chars
        l_map: dictionary for labels
        pad_word: word padding
        pad_char: char padding
        pad_label: label padding
        start_label: start label
        label_seq: output format; set `True` to couple each label with its word, or `False` to insert labels into the text
        batch_size: size of batch in decoding
        caseless: whether to lowercase words before lookup
    """
    def __init__(self,
                 if_cuda,
                 f_map,
                 c_map,
                 l_map,
                 pad_word,
                 pad_char,
                 pad_label,
                 start_label,
                 label_seq=True,
                 batch_size=50,
                 caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.pad_char = pad_char
        self.f_map = f_map
        self.c_map = c_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features, file_no):
        """
        apply_model function for LM-LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
            file_no: corpus index forwarded to the model
        """
        char_features = encode2char_safe(features, self.c_map)

        if self.caseless:
            word_features = encode_safe(
                list(map(lambda t: list(map(lambda x: x.lower(), t)),
                         features)), self.f_map, self.f_map['<unk>'])
        else:
            word_features = encode_safe(features, self.f_map,
                                        self.f_map['<unk>'])

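        # per-word char spans; +1 accounts for the separator concatChar inserts between words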
        fea_len = [list(map(lambda t: len(t) + 1, f)) for f in char_features]
        forw_features = concatChar(char_features, self.c_map)

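        # batch-wide padded lengths: word_len for word sequences, char_len for the char streams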
        word_len = max(map(lambda t: len(t) + 1, word_features))
        char_len = max(
            map(lambda t: len(t[0]) + word_len - len(t[1]),
                zip(forw_features, word_features)))
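        # forward/backward char streams, plus the word-boundary positions inside each stream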
        forw_t = list(
            map(lambda t: t + [self.pad_char] * (char_len - len(t)),
                forw_features))
        back_t = torch.LongTensor(list(map(lambda t: t[::-1], forw_t)))
        forw_t = torch.LongTensor(forw_t)
        forw_p = torch.LongTensor(
            list(
                map(
                    lambda t: list(
                        itertools.accumulate(t + [1] * (word_len - len(t)))),
                    fea_len)))
        back_p = torch.LongTensor(
            list(
                map(
                    lambda t: [char_len - 1] +
                    [char_len - 1 - tup for tup in t[:-1]], forw_p)))

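        # 1s over real tokens plus one end-of-sequence step, 0s over padding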
        masks = torch.ByteTensor(
            list(
                map(
                    lambda t: [1] * (len(t) + 1) + [0] *
                    (word_len - len(t) - 1), word_features)))
        word_t = torch.LongTensor(
            list(
                map(lambda t: t + [self.pad_word] * (word_len - len(t)),
                    word_features)))

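        # transpose to (seq_len, batch) layout; the model consumes time-major tensors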
        if self.if_cuda:
            f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda()
            f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda()
            b_f = autograd.Variable(back_t.transpose(0, 1)).cuda()
            b_p = autograd.Variable(back_p.transpose(0, 1)).cuda()
            w_f = autograd.Variable(word_t.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            f_f = autograd.Variable(forw_t.transpose(0, 1))
            f_p = autograd.Variable(forw_p.transpose(0, 1))
            b_f = autograd.Variable(back_t.transpose(0, 1))
            b_p = autograd.Variable(back_p.transpose(0, 1))
            w_f = autograd.Variable(word_t.transpose(0, 1))
            mask_v = masks.transpose(0, 1)

        scores = ner_model(f_f, f_p, b_f, b_p, w_f, file_no)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
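
A sketch of calling apply_model directly. The padding and start entries ('<eof>', '\n', '<pad>', '<start>') mirror what the LM-LSTM-CRF scripts appear to pass in; they, the maps, and the trained ner_model are all assumptions here.

predictor = predict_wc(if_cuda=False, f_map=f_map, c_map=c_map, l_map=l_map,
                       pad_word=f_map['<eof>'], pad_char=c_map['\n'],
                       pad_label=l_map['<pad>'], start_label=l_map['<start>'])

sentences = [['EU', 'rejects', 'German', 'call']]
decoded = predictor.apply_model(ner_model, sentences, file_no=0)

# decoded holds label indices; invert l_map to read the tags back
r_l_map = {v: k for k, v in l_map.items()}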
Example #4
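# Imports assumed from the LM-LSTM-CRF repo layout (module paths are an assumption):
import torch
import torch.autograd as autograd

from model.crf import CRFDecode_vb
from model.predictor import predict
from model.utils import encode_safe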
class predict_w(predict):
    """prediction class for word level model (LSTM-CRF)

    args: 
        if_cuda: if use cuda to speed up 
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label 
        label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
        batch_size: size of batch in decoding
        caseless: caseless or not
    """
    def __init__(self,
                 if_cuda,
                 f_map,
                 l_map,
                 pad_word,
                 pad_label,
                 start_label,
                 label_seq=True,
                 batch_size=50,
                 caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
        """
        if self.caseless:
            features = list(
                map(lambda t: list(map(lambda x: x.lower(), t)), features))
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

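        # 1s over real tokens plus one end-of-sequence step, 0s over padding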
        masks = torch.ByteTensor(
            list(
                map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1),
                    features)))
        word_features = torch.LongTensor(
            list(
                map(lambda t: t + [self.pad_word] * (f_len - len(t)),
                    features)))

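        # transpose to (seq_len, batch) layout; the model consumes time-major tensors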
        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
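
The word-level counterpart, with the same caveats about the assumed maps, padding entries, and model:

predictor = predict_w(if_cuda=False, f_map=f_map, l_map=l_map,
                      pad_word=f_map['<eof>'], pad_label=l_map['<pad>'],
                      start_label=l_map['<start>'])
decoded = predictor.apply_model(ner_model, [['EU', 'rejects', 'German', 'call']])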