def load_data(self):
    """Load the dataset at ``self.filename`` according to its suffix.

    ``.json`` files go through ``read_json`` and ``.jsonl`` files through
    ``read_jsonline``; any other suffix falls back to ``read_json`` and a
    warning is logged after the read.

    Returns:
        The parsed data returned by the underlying reader.
    """
    if self.filename.endswith('.jsonl'):
        data = read_jsonline(self.filename)
    else:
        data = read_json(self.filename)
        if not self.filename.endswith('.json'):
            # Best-effort fallback: unknown suffixes are still parsed as json.
            self.logger.warning('Your file suffix is not json or jsonl')
    return data
def __transform(self):
    """Read, shuffle, and convert sentences into padded id arrays.

    Each sentence's tokens and labels are mapped to ids, then truncated or
    padded to ``self.__sent_padding_length``. When ``self.__is_skip_window``
    is set, word indices are expanded via ``self.__indices2index_windows``.

    Returns:
        tuple: ``(sents, labels, seq_lengths)`` as numpy arrays.
    """
    sents = []
    labels = []
    seq_lengths = []
    input_sents = read_json(self.__filename)
    # BUG FIX: the original shuffled five consecutive times; a single
    # shuffle already produces a uniformly random permutation.
    random.shuffle(input_sents)
    pad_len = self.__sent_padding_length
    pad_idx = self.__word2id_mapper[BATCH_PAD]  # hoisted loop invariant
    for sent in input_sents:
        sent_words = [t['text'] for t in sent['tokens']]
        sent_labels = sent['labels']
        mapped_words = [self.__word2id_mapper[word] for word in sent_words]
        mapped_labels = [
            self.__label2id_mapper[label] for label in sent_labels
        ]
        if len(mapped_words) >= pad_len:
            mapped_words = mapped_words[:pad_len]
            mapped_labels = mapped_labels[:pad_len]
        else:
            # Labels are padded with 0 — presumably the "outside"/null
            # label id; confirm against __label2id_mapper.
            mapped_words += [pad_idx] * (pad_len - len(mapped_words))
            mapped_labels += [0] * (pad_len - len(mapped_labels))
        if self.__is_skip_window:
            sents.append(self.__indices2index_windows(mapped_words))
        else:
            sents.append(mapped_words)
        labels.append(mapped_labels)
        # NOTE(review): for over-long sentences this records the original
        # (un-truncated) length, which exceeds pad_len — presumably a
        # downstream mask handles it; confirm.
        seq_lengths.append(len(sent_labels))
        self.__sent_count += 1
    return np.array(sents), np.array(labels), np.array(seq_lengths)
def evaluate(self, pred_filename):
    """Evaluate k-fold predictions and aggregate per-entity-type metrics.

    Args:
        pred_filename: path to a json file holding one prediction set per
            fold (``self.k`` entries).

    Returns:
        pandas.DataFrame: one row per fold plus ``macro`` and ``micro``
        aggregate rows; the raw count columns are dropped before return.
    """
    pred_data = read_json(pred_filename)
    # Nested mapping {metric-entity_type: {fold_index: value}}.
    kfold_counter = {}
    for i in range(self.k):
        single_fold_pred_data = pred_data[i]
        ret = self.__evaluators[i].evaluate(single_fold_pred_data,
                                            is_percentage=False)
        for metrics, e_counter in ret.items():
            for e_type, val in e_counter.items():
                # BUG FIX: indexing a missing key on a plain dict raised
                # KeyError; setdefault creates the inner dict on demand.
                key = metrics + '-' + e_type
                kfold_counter.setdefault(key, {})[str(i)] = val
    # Outer keys become columns, fold indices become the row index.
    kfold_counter = pd.DataFrame(kfold_counter)
    # BUG FIX: the original used axis=1 (per-fold row sums) but then
    # indexed the result by metric column names; the per-column sum over
    # folds (axis=0) is what the lookups below require.
    counter_sum = kfold_counter.sum(axis=0)
    macro_row = {}
    micro_row = {}
    for e in self.__entity_types:
        # NOTE(review): a fold with zero predicted/true entities makes
        # these divisions produce inf/nan — confirm upstream guarantees.
        micro_precision = (counter_sum['true_positive_count-' + e] /
                           counter_sum['pred_count-' + e])
        micro_recall = (counter_sum['true_positive_count-' + e] /
                        counter_sum['true_count-' + e])
        micro_f1 = (2 * micro_precision * micro_recall /
                    (micro_precision + micro_recall))
        micro_row['precision-' + e] = micro_precision
        micro_row['recall-' + e] = micro_recall
        micro_row['f1-' + e] = micro_f1
        macro_row['precision-' + e] = counter_sum['precision-' + e] / self.k
        macro_row['recall-' + e] = counter_sum['recall-' + e] / self.k
        macro_row['f1-' + e] = counter_sum['f1-' + e] / self.k
    # BUG FIX: DataFrame.append returned a NEW frame that was discarded
    # (and is removed in pandas 2.x), so the aggregate rows never appeared.
    # Assign rows in place instead; columns absent from the dicts get NaN.
    kfold_counter.loc['macro'] = pd.Series(macro_row)
    kfold_counter.loc['micro'] = pd.Series(micro_row)
    for e in self.__entity_types:
        col_names = ['true_positive_count-' + e, 'pred_count-' + e]
        kfold_counter.drop(col_names, axis='columns', inplace=True)
    return kfold_counter
def __get_data(self, data):
    """Normalize *data* into evaluation-ready form.

    ``None`` and lists pass through unchanged; a string is treated as a
    json filename and loaded.

    Raises:
        TypeError: for any other input type.
    """
    if data is None or isinstance(data, list):
        return data
    if isinstance(data, str):
        return read_json(data)
    raise TypeError(
        'input data type {} is invalid in evaluation'.format(type(data)))
def __init__(self, k, true_filename, entity_types=ENTITY_TYPES):
    """Build one ``EntityEvaluator`` per fold from the gold-standard file.

    Args:
        k: number of folds; must equal the number of entries in the file.
        true_filename: json file holding per-fold gold data.
        entity_types: entity labels to evaluate.

    Raises:
        ValueError: when the file does not contain exactly ``k`` folds.
    """
    self.k = k
    self.__entity_types = entity_types
    true_data = read_json(true_filename)
    if len(true_data) != k:
        raise ValueError('k and true data does not correspond.')
    self.__evaluators = [
        EntityEvaluator(fold_data, entity_types=entity_types)
        for fold_data in true_data
    ]
def from_json(cls, filename):
    """Alternate constructor: build an instance from a json config file.

    NOTE(review): takes ``cls`` — presumably decorated with @classmethod
    where it is defined; the decorator is outside this view.
    """
    config = read_json(filename)
    return cls(**config)