Example No. 1
def tri_out(filename, thr, output_file='test.txt', config_path='e93.yaml'):
    """
    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis
    """
    thr = int(thr)
    with open(config_path) as config_file:
        config_data = yaml.safe_load(config_file)
    config = Config(data=config_data)

    votes_others = load_others_votes(config, [
        FINAL,
    ])
    votes_tri = load_tri_votes(config, [
        FINAL,
    ])
    labels = list()

    dataset = Processor.load_origin(filename)
    with open('out/otri.txt', 'w') as file_obj:
        for i, (d, v_others,
                v_tri) in enumerate(zip(dataset, votes_others, votes_tri)):
            label = d[-1]
            if label != 0:
                # re-label when the tri-classifier vote disagrees strongly enough
                idx, max_value = argmax(v_tri)
                if label != idx and max_value >= thr:
                    new_label = label_str[idx]
                    file_obj.write(
                        '{}\t{}\t{}\t{}\t{} ({}->{}, {} {})\n'.format(
                            i, d[0], d[1], d[2], d[-1], label, new_label,
                            v_others, v_tri))
                    label = idx
            labels.append(label)
    export_final(output_file, labels)
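The argmax helper used above is not defined in this example; from the way its result is unpacked it is assumed to return both the index of the largest vote and that vote's value. A minimal sketch under that assumption:

def argmax(votes):
    # Return (index, value) of the largest entry in a list of vote counts
    idx = max(range(len(votes)), key=lambda k: votes[k])
    return idx, votes[idx]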
def analyse_submit(filename):
    # Count how many samples received each label in a submission file
    dataset = Processor.load_origin(filename)
    label_count = defaultdict(int)
    for res in dataset:
        label = res[-1]
        label_count[label] += 1
    print(label_count)
Example No. 3
def eval_sub(input_filename):
    # Evaluate a submission file against the gold labels of the FINAL split
    dataset = Processor.load_origin(input_filename)
    labels_predict = list(map(lambda _item: _item[-1], dataset))
    labels_gold = load_label_list(data_config.path(FINAL, LABEL))

    res = basic_evaluate(gold=labels_gold, pred=labels_predict)
    print_evaluation(res)
    for col in res[CONFUSION_MATRIX]:
        print(','.join(map(str, col)))
    print()
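load_label_list is used throughout these examples to read gold and predicted label files; a minimal sketch of the assumed behaviour (one integer label per line):

def load_label_list(path):
    # Assumed format: one integer label per non-empty line
    with open(path) as file_obj:
        return [int(line.strip()) for line in file_obj if line.strip()]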
Example No. 4
def diff(a_filename, b_filename, output_filename, config_path='e93.yaml'):
    with open(config_path) as config_file:
        config_data = yaml.safe_load(config_file)
    config = Config(data=config_data)

    votes = None

    # Count, per sample, how many of the "others" models predicted label 0 (OTHERS)
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)

        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]

        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(a_filename)
    labels_a = list(map(lambda _item: _item[-1], dataset))

    dataset = Processor.load_origin(b_filename)
    labels_b = list(map(lambda _item: _item[-1], dataset))

    assert len(votes) == len(labels_a) == len(labels_b)

    n_match = 0
    with open(output_filename, 'w') as file_obj:
        # note: dataset here holds the samples loaded from b_filename
        for i, (a, b, d) in enumerate(zip(labels_a, labels_b, dataset)):
            if a == 3:
                if b == 0:
                    file_obj.write('{}\t{}\t{}\t{}\t{}->{} ({})\n'.format(
                        i, d[0], d[1], d[2], label_str[a], label_str[b],
                        votes[i]))
                else:
                    n_match += 1
    print(n_match)
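The vote-counting loop over config.others reappears in filter_by_others and main below; a hypothetical helper that isolates it (the name count_others_votes is illustrative, and it relies on the same module-level modes, data_config and LABEL_PREDICT as the functions here):

def count_others_votes(config, mode):
    # For each sample, count how many of the "others" models predicted label 0 (OTHERS)
    votes = None
    for output_key in config.others:
        labels = []
        for _mode in modes[mode]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)
        if votes is None:
            votes = [0] * len(labels)
        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1
    return votes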
Example No. 5
def export_error(filename):
    dataset = Processor.load_origin(filename)

    path = data_config.path(FINAL, LABEL)
    gold = load_label_list(path)
    # Group misclassified samples by (gold, predicted) label pair
    wrong = defaultdict(lambda: defaultdict(list))

    for g, sample in zip(gold, dataset):
        p = sample[-1]
        if p != g:
            wrong[g][p].append(sample[0] + ' | ' + sample[1] + ' | ' +
                               sample[2])

    for _g in range(4):
        for _p in range(4):
            print('{}->{}'.format(label_str[_g], label_str[_p]))
            for sample in wrong[_g][_p]:
                print('\t{}'.format(sample))
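label_str maps the four label indices to class names; the mapping itself is defined elsewhere in the project, so the version below is only a placeholder assumption for reading these examples (index 0 being the OTHERS class is the one thing the surrounding code confirms):

# Assumed placeholder mapping; only index 0 (OTHERS) is confirmed by the code above
label_str = ['others', 'label_1', 'label_2', 'label_3']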
Example No. 6
def filter_by_others(input_filename,
                     output_filename,
                     thr,
                     config_path='e93.yaml'):
    thr = int(thr)
    with open(config_path) as config_file:
        config_data = yaml.safe_load(config_file)
    config = Config(data=config_data)

    votes = None

    # Count, per sample, how many of the "others" models predicted label 0 (OTHERS)
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)

        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]

        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(input_filename)
    labels = list(map(lambda _item: _item[-1], dataset))

    assert len(votes) == len(labels)

    with open(output_filename, 'w') as file_obj:
        for i, (p, d) in enumerate(zip(labels, dataset)):
            # overwrite predictions that enough "others" models voted down to OTHERS
            if p != 0 and votes[i] >= thr:
                file_obj.write('{}\t{}\t{}\t{}\t{} ({})\n'.format(
                    i, d[0], d[1], d[2], p, votes[i]))
                labels[i] = 0
    export_final('test.txt', labels)
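export_final writes the corrected labels out as the final submission; a minimal sketch, assuming one label per line:

def export_final(path, labels):
    # Assumed submission format: one label per line
    with open(path, 'w') as file_obj:
        file_obj.write('\n'.join(map(str, labels)) + '\n')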
def test_submit(filename_pred, filename_gold):
    pred = [_item[-1] for _item in Processor.load_origin(filename_pred)]
    gold = [_item[-1] for _item in Processor.load_origin(filename_gold)]
    res = basic_evaluate(gold=gold, pred=pred)
    print_evaluation(res)
def build_basic():
    config.prepare_data_folder()
    # TRAIN
    labels = list()
    text_turns = [[] for _ in range(3)]
    for turn_1, turn_2, turn_3, label_idx in Processor.load_origin_train():
        turns = [turn_1, turn_2, turn_3]
        for i, r in enumerate(turns):
            text_turns[i].append(r)
        labels.append(label_idx)

    for i, texts in enumerate(text_turns):
        path = config.path(TRAIN, 'turn', str(i))
        open(path, 'w').write('\n'.join(texts) + '\n')

    path = config.path(TRAIN, LABEL)
    open(path, 'w').write('\n'.join(map(str, labels)) + '\n')

    binary_labels = [0 if label == 0 else 1 for label in labels]
    path = config.path(TRAIN, LABEL, 'binary')
    open(path, 'w').write('\n'.join(map(str, binary_labels)) + '\n')

    # TEST
    labels = list()
    text_turns = [[] for _ in range(3)]
    for turn_1, turn_2, turn_3, label_idx in Processor.load_origin_dev():
        turns = [turn_1, turn_2, turn_3]
        for i, r in enumerate(turns):
            text_turns[i].append(r)
        labels.append(label_idx)

    for i, texts in enumerate(text_turns):
        path = config.path(TEST, 'turn', str(i))
        open(path, 'w').write('\n'.join(texts) + '\n')

    path = config.path(TEST, LABEL)
    open(path, 'w').write('\n'.join(map(str, labels)) + '\n')

    binary_labels = [0 if label == 0 else 1 for label in labels]
    path = config.path(TEST, LABEL, 'binary')
    open(path, 'w').write('\n'.join(map(str, binary_labels)) + '\n')

    # FINAL
    labels = list()
    text_turns = [[] for _ in range(3)]
    for turn_1, turn_2, turn_3, label_idx in Processor.load_origin_test():
        turns = [turn_1, turn_2, turn_3]
        for i, r in enumerate(turns):
            text_turns[i].append(r)
        labels.append(label_idx)

    for i, texts in enumerate(text_turns):
        path = config.path(FINAL, 'turn', str(i))
        open(path, 'w').write('\n'.join(texts) + '\n')

    path = config.path(FINAL, LABEL)
    open(path, 'w').write('\n'.join(map(str, labels)) + '\n')

    binary_labels = [0 if label == 0 else 1 for label in labels]
    path = config.path(FINAL, LABEL, 'binary')
    open(path, 'w').write('\n'.join(map(str, binary_labels)) + '\n')
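The binary label files collapse the three non-OTHERS classes into a single positive class, for example:

labels = [0, 2, 0, 3, 1]
binary_labels = [0 if label == 0 else 1 for label in labels]
# binary_labels == [0, 1, 0, 1, 1]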
Example No. 9
def main(input_filename, config_path='e93.yaml', final_output=None):
    """
    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis
    """
    with open(config_path) as config_file:
        config_data = yaml.safe_load(config_file)
    config = Config(data=config_data)

    labels_gold = dict()
    labels_predict = dict()
    labels_predict_last = dict()

    dataset = Processor.load_origin(input_filename)
    labels_predict[FINAL] = list(map(lambda _item: _item[-1], dataset))

    for mode in [
            FINAL,
    ]:
        if mode != FINAL:
            res = basic_evaluate(gold=labels_gold[mode],
                                 pred=labels_predict[mode])
            print(mode)
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))
            print()

        n_sample = len(labels_predict[mode])
        labels_predict_last[mode] = labels_predict[mode]

        # Correct labels within the "HAS" (non-OTHERS) classes by tri-classifier vote
        if config.tri_enabled:
            n_changed = 0

            votes = [[0 for _ in range(4)] for _ in range(n_sample)]
            for output_key in config.tri:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))

                for i, label in enumerate(labels):
                    votes[i][label] += 1

            base = list(labels_predict_last[mode])  # work on a copy
            for i, vote in enumerate(votes):
                arg_max = int(np.argmax(vote))
                if arg_max == 0:
                    continue
                if base[i] != 0:
                    if vote[arg_max] >= config.tri_min_vote:
                        if base[i] != arg_max:
                            n_changed += 1
                        base[i] = arg_max
                elif vote[arg_max] >= config.tri_out_vote:
                    base[i] = arg_max
                    n_changed += 1

            print('n_exchanged within "HAS": {}'.format(n_changed))

            labels_predict_last[mode] = base
            if mode != FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after TRI)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        # Re-label samples predicted as "HAS" back to OTHERS when enough models vote 0
        if config.others_enabled:
            votes = [0 for _ in range(n_sample)]
            n_changed = 0

            for output_key in config.others:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))

                for i, label in enumerate(labels):
                    if label == 0:
                        votes[i] += 1
            if config.others_min_vote == 'all':
                min_vote = len(config.others)
            else:
                min_vote = int(config.others_min_vote)
            base = list(labels_predict_last[mode])  # work on a copy
            for i, vote in enumerate(votes):
                if vote >= min_vote:
                    if base[i] != 0:
                        n_changed += 1
                    base[i] = 0
            print('n_changed to "OTHERS": {}'.format(n_changed))

            labels_predict_last[mode] = base
            if mode != FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after OTHERS)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        if mode == FINAL and final_output is not None:
            labels = labels_predict_last[FINAL]
            export_final(final_output, labels)
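The TRI step in main can be read in isolation as a per-sample majority vote over the tri-classifier outputs; a simplified sketch of that step, with the thresholds passed in instead of read from the config:

import numpy as np

def apply_tri_correction(base, votes, min_vote, out_vote):
    # base: current predictions, modified in place; votes[i]: per-class counts for sample i
    n_changed = 0
    for i, vote in enumerate(votes):
        arg_max = int(np.argmax(vote))
        if arg_max == 0:
            continue
        if base[i] != 0:
            # already a "HAS" class: follow the majority if it has enough votes
            if vote[arg_max] >= min_vote:
                if base[i] != arg_max:
                    n_changed += 1
                base[i] = arg_max
        elif vote[arg_max] >= out_vote:
            # currently OTHERS: a separate, typically stricter threshold applies
            base[i] = arg_max
            n_changed += 1
    return n_changed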