import itertools

import torch
import torch.autograd as autograd

# Assumed module layout (the usual LM-LSTM-CRF repo structure); adjust these
# paths if the helpers live elsewhere in your tree.
from model.crf import CRFDecode_vb
from model.evaluator import eval_batch
from model.predictor import predict
from model.utils import encode_safe, encode2char_safe, concatChar


class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args:
        packer: provides methods to repack batches into the original space [TODO: needs improvement]
        l_map: dictionary for labels
        score_type: metric selector; any string containing 'f' selects F1, otherwise token accuracy
    """

    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader, file_no):
        """
        calculate scores for the pre-selected metric

        args:
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for the test set
            file_no: index selecting which corpus/output layer to evaluate
        """
        ner_model.eval()
        self.reset()

        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(dataset_loader):
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f, file_no)
            # Viterbi-decode the best label sequence, ignoring padded positions
            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(decoded, tg)

        return self.calc_s()
class eval_w(eval_batch):
    """evaluation class for the word-level model (LSTM-CRF)

    args:
        packer: provides methods to repack batches into the original space [TODO: needs improvement]
        l_map: dictionary for labels
        score_type: metric selector; any string containing 'f' selects F1, otherwise token accuracy
    """

    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader):
        """
        calculate scores for the pre-selected metric

        args:
            ner_model: LSTM-CRF model
            dataset_loader: loader class for the test set
        """
        ner_model.eval()
        self.reset()

        for feature, tg, mask in itertools.chain.from_iterable(dataset_loader):
            fea_v, _, mask_v = self.packer.repack_vb(feature, tg, mask)
            scores, _ = ner_model(fea_v)
            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(decoded, tg)

        return self.calc_s()
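# Usage sketch for the evaluators above (all names hypothetical: a trained
# `ner_model`, a `packer` such as the repo's repack helper, a label map
# `l_map` containing '<start>' and '<pad>', and a `dev_loader` built by the
# training script). This only illustrates the call pattern:
#
#   evaluator = eval_w(packer, l_map, 'f')   # 'f' -> report F1 rather than accuracy
#   dev_scores = evaluator.calc_score(ner_model, dev_loader)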
class predict_wc(predict):
    """prediction class for LM-LSTM-CRF

    args:
        if_cuda: whether to use cuda to speed up
        f_map: dictionary for words
        c_map: dictionary for chars
        l_map: dictionary for labels
        pad_word: word padding
        pad_char: char padding
        pad_label: label padding
        start_label: start label
        label_seq: decode-format flag; `True` couples each label with its word,
            `False` inserts the labels into the text
        batch_size: batch size used in decoding
        caseless: whether to lowercase words before lookup
    """

    def __init__(self, if_cuda, f_map, c_map, l_map, pad_word, pad_char,
                 pad_label, start_label, label_seq=True, batch_size=50,
                 caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.pad_char = pad_char
        self.f_map = f_map
        self.c_map = c_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features, file_no):
        """
        apply_model function for LM-LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of lists of words
        """
        char_features = encode2char_safe(features, self.c_map)

        if self.caseless:
            word_features = encode_safe(
                list(map(lambda t: list(map(lambda x: x.lower(), t)), features)),
                self.f_map, self.f_map['<unk>'])
        else:
            word_features = encode_safe(features, self.f_map, self.f_map['<unk>'])

        # per-word char counts (+1 for the word delimiter), then each
        # sentence's chars concatenated into a single stream
        fea_len = [list(map(lambda t: len(t) + 1, f)) for f in char_features]
        forw_features = concatChar(char_features, self.c_map)

        word_len = max(map(lambda t: len(t) + 1, word_features))
        char_len = max(map(lambda t: len(t[0]) + word_len - len(t[1]),
                           zip(forw_features, word_features)))
        forw_t = list(map(lambda t: t + [self.pad_char] * (char_len - len(t)), forw_features))
        # the backward character LM reads the same stream reversed
        back_t = torch.LongTensor(list(map(lambda t: t[::-1], forw_t)))
        forw_t = torch.LongTensor(forw_t)
        # forw_p / back_p: word-boundary indices into the forward / reversed stream
        forw_p = torch.LongTensor(
            list(map(lambda t: list(itertools.accumulate(t + [1] * (word_len - len(t)))), fea_len)))
        back_p = torch.LongTensor(
            list(map(lambda t: [char_len - 1] + [char_len - 1 - tup for tup in t[:-1]], forw_p)))
        masks = torch.ByteTensor(
            list(map(lambda t: [1] * (len(t) + 1) + [0] * (word_len - len(t) - 1), word_features)))
        word_t = torch.LongTensor(
            list(map(lambda t: t + [self.pad_word] * (word_len - len(t)), word_features)))

        # batch-first -> time-first, then move to GPU if requested
        if self.if_cuda:
            f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda()
            f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda()
            b_f = autograd.Variable(back_t.transpose(0, 1)).cuda()
            b_p = autograd.Variable(back_p.transpose(0, 1)).cuda()
            w_f = autograd.Variable(word_t.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            f_f = autograd.Variable(forw_t.transpose(0, 1))
            f_p = autograd.Variable(forw_p.transpose(0, 1))
            b_f = autograd.Variable(back_t.transpose(0, 1))
            b_p = autograd.Variable(back_p.transpose(0, 1))
            w_f = autograd.Variable(word_t.transpose(0, 1))
            mask_v = masks.transpose(0, 1)

        scores = ner_model(f_f, f_p, b_f, b_p, w_f, file_no)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
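# The forw_p / back_p construction above is the subtle part of apply_model:
# forw_p[i][j] is the cumulative end index of word j inside sentence i's
# flattened character stream, and back_p mirrors those offsets into the
# reversed stream so the backward character LM is read out at the same word
# boundaries. A minimal self-contained sketch of that arithmetic for one
# hypothetical three-word sentence (ignoring the [1]-padding applied to
# sentences shorter than the batch maximum):

def _pointer_arithmetic_demo():
    fea_len = [4, 3, 4]    # len(word) + 1 delimiter each, e.g. 'cat', 'on', 'mat'
    char_len = 12          # padded length of the flattened char stream
    forw_p = list(itertools.accumulate(fea_len))                       # word ends: [4, 7, 11]
    back_p = [char_len - 1] + [char_len - 1 - p for p in forw_p[:-1]]  # mirrored: [11, 7, 4]
    assert forw_p == [4, 7, 11]
    assert back_p == [11, 7, 4]
    return forw_p, back_p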
class predict_w(predict):
    """prediction class for the word-level model (LSTM-CRF)

    args:
        if_cuda: whether to use cuda to speed up
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label
        label_seq: decode-format flag; `True` couples each label with its word,
            `False` inserts the labels into the text
        batch_size: batch size used in decoding
        caseless: whether to lowercase words before lookup
    """

    def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label,
                 label_seq=True, batch_size=50, caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map
        self.caseless = caseless

    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of lists of words
        """
        if self.caseless:
            features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

        masks = torch.ByteTensor(
            list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features)))
        word_features = torch.LongTensor(
            list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features)))

        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
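# The mask/padding recipe used in predict_w.apply_model, isolated for clarity:
# every sentence keeps one extra "on" position beyond its last word (the +1),
# presumably so the CRF decoder can score the final stop transition, and is
# then zero-padded out to the batch maximum. A minimal sketch, assuming a
# hypothetical pad index of 0:

def _mask_padding_demo():
    features = [[5, 9, 2], [7, 3]]               # two already-encoded sentences
    pad_word = 0                                  # hypothetical pad index
    f_len = max(len(t) + 1 for t in features)     # batch max length + 1 -> 4
    masks = torch.ByteTensor(
        [[1] * (len(t) + 1) + [0] * (f_len - len(t) - 1) for t in features])
    word_t = torch.LongTensor(
        [t + [pad_word] * (f_len - len(t)) for t in features])
    assert masks.tolist() == [[1, 1, 1, 1], [1, 1, 1, 0]]
    assert word_t.tolist() == [[5, 9, 2, 0], [7, 3, 0, 0]]
    # the model consumes these time-first, hence the transpose in apply_model
    return word_t.transpose(0, 1), masks.transpose(0, 1)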