import os
from collections import defaultdict


def load_from_splits(paths, original_test_filename, model_predicted_filename):
    """Count, per sentence, how many splits disagree between gold and predicted labels."""
    sentence_potential_mistake_count = defaultdict(int)
    for path in paths:
        original_test = os.path.join(path, original_test_filename)
        model_predicted = os.path.join(path, model_predicted_filename)
        assert os.path.exists(original_test)
        assert os.path.exists(model_predicted)
        original_test = load_dataset_from_column(original_test)
        # Use schema="none" since the predictions may contain invalid label sequences.
        model_predicted = load_dataset_from_column(model_predicted, schema="none")
        for (original_sentence, original_labels), (model_sentence, model_labels) in zip(
                original_test, model_predicted):
            # Both files must contain the same sentences in the same order.
            assert ' '.join(original_sentence) == ' '.join(model_sentence)
            if ' '.join(original_labels) != ' '.join(model_labels):
                sentence_potential_mistake_count[' '.join(original_sentence)] += 1
    return sentence_potential_mistake_count
def form_weighted_train_set(train_files, train_file_schema, eps, mistake_count):
    """Build a training set whose per-token weights decay as eps ** (mistake count)."""
    for train_file in train_files:
        assert os.path.exists(train_file)
    train_set = []
    for train_file in train_files:
        # Load every training file with the schema passed in by the caller.
        train_set.extend(load_dataset_from_column(train_file, schema=train_file_schema))
    weighted_train_set = []
    for sentence, labels in train_set:
        # Sentences flagged as mistaken in more splits receive exponentially smaller weights.
        mistakes = mistake_count.get(' '.join(sentence), 0)
        weight = eps ** mistakes
        weighted_train_set.append([sentence, labels, [weight] * len(labels)])
    return weighted_train_set
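

# Example usage (a minimal sketch): the split directories, filenames, schema
# name, and eps value below are hypothetical placeholders, not part of the
# original code; adjust them to the actual experiment layout.
if __name__ == '__main__':
    split_paths = ['splits/fold_0', 'splits/fold_1', 'splits/fold_2']
    mistake_count = load_from_splits(split_paths, 'test.txt', 'predictions.txt')
    # Sentences the model mislabels in more splits are down-weighted as eps ** mistakes,
    # so likely-noisy examples contribute less to the weighted training set.
    weighted_train_set = form_weighted_train_set(
        ['train.txt'], train_file_schema='bio', eps=0.7, mistake_count=mistake_count)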