Example #1
import os
from collections import defaultdict

# `load_dataset_from_column` is assumed to be defined elsewhere in the project;
# it reads a column-formatted file and yields (tokens, labels) pairs.


def load_from_splits(paths, original_test_filename, model_predicted_filename):
    """Count, per sentence, on how many splits the model's predicted labels
    disagree with the gold labels (potential annotation mistakes)."""
    sentence_potential_mistake_count = defaultdict(int)
    for path in paths:
        original_test = os.path.join(path, original_test_filename)
        model_predicted = os.path.join(path, model_predicted_filename)
        assert os.path.exists(original_test)
        assert os.path.exists(model_predicted)
        original_test = load_dataset_from_column(original_test)
        model_predicted = load_dataset_from_column(
            model_predicted,
            schema="none")  # since there may be invalid label sequences.
        for (original_sentence,
             original_labels), (model_sentence,
                                model_labels) in zip(original_test,
                                                     model_predicted):
            # The two files must contain the same sentences in the same order.
            assert ' '.join(original_sentence) == ' '.join(model_sentence)
            # Any label disagreement marks the sentence as a potential mistake.
            if ' '.join(original_labels) != ' '.join(model_labels):
                sentence_potential_mistake_count[' '.join(
                    original_sentence)] += 1
    return sentence_potential_mistake_count
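
A minimal usage sketch, assuming one directory per cross-validation split, each holding a gold test file and a model-predicted file (the directory and file names below are illustrative, not part of the original code):

split_dirs = [f"split_{i}" for i in range(10)]  # hypothetical split directories
mistake_count = load_from_splits(split_dirs, "test.txt", "predicted.txt")
# mistake_count maps each sentence (joined by spaces) to the number of splits
# on which the model's labels disagreed with the gold labels.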
Example #2
import os

# `load_dataset_from_column` is assumed to be defined elsewhere in the project.


def form_weighted_train_set(train_files, train_file_schema, eps,
                            mistake_count):
    """Attach a per-token weight to each training sentence, downweighting
    sentences flagged as potential mistakes by eps ** mistake_count."""
    for train_file in train_files:
        assert os.path.exists(train_file)
    train_set = []
    for train_file in train_files:
        train_set.extend(
            load_dataset_from_column(train_file, schema=train_file_schema))

    weighted_train_set = []
    for sentence, labels in train_set:
        # Sentences that disagreed with model predictions on more splits
        # receive exponentially smaller weights.
        mistakes = mistake_count.get(' '.join(sentence), 0)
        weight = eps**mistakes
        weighted_train_set.append([sentence, labels, [weight] * len(labels)])
    return weighted_train_set
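
A hedged end-to-end sketch combining both examples; the file names, the schema value, and eps=0.7 are illustrative assumptions, not values from the original code:

train_file_schema = "none"  # hypothetical schema; valid values depend on load_dataset_from_column
mistake_count = load_from_splits(
    [f"split_{i}" for i in range(10)], "test.txt", "predicted.txt")
weighted = form_weighted_train_set(["train.txt"], train_file_schema, eps=0.7,
                                   mistake_count=mistake_count)
# Each entry is [tokens, labels, per-token weights]; clean sentences keep
# weight 1.0, while a sentence flagged on k splits gets weight 0.7 ** k.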