Example #1
def filter_labels(data, labels, keep_labels):
    """filter a dataset by label"""

    grouped = dict(ml.group_by_label(data, labels))

    grouped_filtered = {label: group for label, group in grouped.items()
                        if label in keep_labels}
    return zip(*[(item, label)
                 for label, group in grouped_filtered.items()
                 for item in group])
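
A minimal usage sketch with hypothetical toy data; it assumes ml.group_by_label yields (label, group) pairs in label-first-seen order, as the dict() call above implies:

data = [0.1, 0.2, 0.3, 0.4]
labels = ["a", "a", "b", "c"]

# Keep only samples labeled "a" or "b"; the "c" sample is dropped.
data_kept, labels_kept = filter_labels(data, labels, {"a", "b"})
# data_kept   -> (0.1, 0.2, 0.3)
# labels_kept -> ("a", "a", "b")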
Example #2
def visualize_training_data(data_train, labels_train, color_to_gray):
    """visualize training data"""

    for cur_label, group in ml.group_by_label(data_train, labels_train):
        print("label:", cur_label)
        group_prepped = [(color_to_gray(x), None) for x in group]
        group_pred = [Sample(x, cur_label, 0.0, False) for x in group_prepped]
        _ = label_chars(group_pred)
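
A hedged call sketch; mean_gray below is a hypothetical stand-in for whatever grayscale conversion the project actually passes as color_to_gray (Sample and label_chars come from elsewhere in the codebase):

import numpy as np

def mean_gray(img):
    """Hypothetical conversion: collapse an HxWx3 color image to HxW by channel mean."""
    return np.mean(img, axis=2).astype(np.uint8)

# visualize_training_data(data_train, labels_train, mean_gray)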
Example #3
    def test_balance(self):
        """test dataset balancing functionality"""

        data, labels = zip(*([(0.0, "a")] * 4 + [(1.0, "b")] * 1))

        balanced_data, balanced_labels = ml.balance(data, labels, 0.5,
                                                    lambda x: x)
        balanced_grouped = dict(
            ml.group_by_label(balanced_data, balanced_labels))
        for label, group in balanced_grouped.items():
            self.assertEqual(len(group), 2)

        balanced_data, balanced_labels = ml.balance(data, labels, 8,
                                                    lambda x: x)
        balanced_grouped = dict(
            ml.group_by_label(balanced_data, balanced_labels))
        for label, group in balanced_grouped.items():
            self.assertEqual(len(group), 8)
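
As the two assertions suggest, the balance size appears to act as a fraction of the largest group when below 1.0 and as an absolute per-label count otherwise. A minimal sketch of that contract, assuming duplicate-based upsampling; the project's actual ml.balance may differ:

import random

def balance_sketch(data, labels, balance_size, augment_func):
    """Hypothetical re-implementation of the contract test_balance exercises."""
    grouped = {}
    for item, label in zip(data, labels):
        grouped.setdefault(label, []).append(item)
    largest = max(len(group) for group in grouped.values())
    # Below 1.0: fraction of the largest group (0.5 * 4 -> 2 per label).
    # At 1.0 or above: absolute per-label count (8 -> 8 per label).
    target = (int(balance_size * largest) if balance_size < 1.0
              else int(balance_size))
    data_out, labels_out = [], []
    for label, group in grouped.items():
        while len(group) < target:  # upsample small groups via augment_func
            group = group + [augment_func(random.choice(group))]
        data_out.extend(group[:target])  # truncate large groups
        labels_out.extend([label] * target)
    return data_out, labels_out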
Example #4
def main(argv):
    """main program"""

    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_charpos.pkl"
    else:
        model_filename = argv[3]

    print("run_charposml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    torch.manual_seed(0)

    thresh_true = 0.5

    pad_image = partial(improc.pad_image,
                        width=config.half_width * 2,
                        height=config.pad_height)
    augment_func = pipe(
        pad_image,  # pad before transformations
        partial(improc.transform_random,
                trans_size=[config.trans_x_size, config.trans_y_size],
                rot_size=config.rot_size,
                scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(filenames_train,
                                                     config.half_width,
                                                     config.offset)
    data_train, labels_train = dataset.prepare(
        data_train_raw, labels_train_raw, config.train.do_subsample,
        config.train.subsample_size, config.train.do_prep_balance,
        config.train.do_balance, config.train.balance_size,
        config.train.do_augment, config.train.augment_size, augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(filenames_dev,
                                                 config.half_width,
                                                 config.offset)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw, labels_dev_raw, config.dev.do_subsample,
        config.dev.subsample_size, config.dev.do_prep_balance,
        config.dev.do_balance, config.dev.balance_size, config.dev.do_augment,
        config.dev.augment_size, augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(filenames_test,
                                                   config.half_width,
                                                   config.offset)
    data_test, labels_test = dataset.prepare(
        data_test_raw, labels_test_raw, config.test.do_subsample,
        config.test.subsample_size, config.test.do_prep_balance,
        config.test.do_balance, config.test.balance_size,
        config.test.do_augment, config.test.augment_size, augment_func)

    print("done")

    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size:  ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count:    ", len(data_train))
    print("dev count:      ", len(data_dev))
    print("test count:     ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    print("discarding letter information from labels")
    print()
    labels_train = [x[0] for x in labels_train]
    labels_dev = [x[0] for x in labels_dev]
    labels_test = [x[0] for x in labels_test]

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    extract_char = improc.extract_pos

    if mode == MODE_TRAIN:

        print("training model...")

        # word_ims_test, char_poss_test = _load_words(filenames_test)
        # distance_test = build_distance_test(word_ims_test, char_poss_test)
        distance_test = lambda x, y: 0.0  # placeholder; the real distance test above is disabled

        if True:  # manual toggle: CNN here, traditional ML in the else branch
            # train a CNN

            def build_find_prob(img_to_prob):
                return lambda word_im: findletters.find_prob(
                    word_im, config.half_width, extract_char, img_to_prob,
                    thresh_true)

            prepare_callback = build_prepare_validation_callback(
                data_dev, labels_dev, build_find_prob, distance_test)

            proc = imml.build_classification_process_cnn(
                data_train,
                labels_train,
                config.half_width * 2,  # - 8 # I forget the reason I was doing this
                config.pad_height,
                config.start_row,
                do_align=False,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")

        else:
            # traditional ML
            proc = imml.build_classification_process_charpos(
                data_train,
                labels_train,
                config.half_width * 2,  # - 8,
                config.pad_height,
                config.start_row)

        classify_char_pos, prep_image, feat_extractor, classifier = proc

        print("done")

        # summarize results

        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]

        print("accuracy score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))

        print("confusion matrix:")
        print(
            sklearn.metrics.confusion_matrix(labels_test, labels_test_pred,
                                             labels=[True, False]))

        # save model
        util.save_dill(proc, model_filename)

    if mode == MODE_TUNE:

        # load model
        proc = util.load_dill(model_filename)
        classify_char_pos, prep_image, feat_extractor, classifier = proc

        # predict on test data
        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]

        # visualize boolean predictions
        if VISUALIZE:
            idx = 0
            slices_per_im = 320
            while idx < len(data_test):
                disp_im = _visualize_boolean_predictions(
                    data_test[idx:(idx + slices_per_im)],
                    labels_test[idx:(idx + slices_per_im)],
                    labels_test_pred[idx:(idx + slices_per_im)],
                    config.half_width, config.offset)
                cv2.namedWindow("boolean predictions", cv2.WINDOW_NORMAL)
                cv2.imshow("boolean predictions", disp_im)
                cv2.waitKey()
                idx = idx + slices_per_im

        # calculate and visualize ROC AUC
        if False and VISUALIZE:  # disabled by default
            # distances_test = model.decision_function(feats_test)
            # distances_test = classifier.model.predict_proba(feats_test)[:, 1]
            distances_test = [
                classifier.predict_proba(x)[0, 1] for x in feats_test
            ]
            fpr, tpr, _ = sklearn.metrics.roc_curve(labels_test,
                                                    distances_test)
            roc_auc = sklearn.metrics.auc(fpr, tpr)
            print("ROC AUC on test dataset:", roc_auc)

            if VISUALIZE:
                # visualize ROC curve
                from matplotlib import pyplot as plt
                plt.figure()
                plt.plot(fpr,
                         tpr,
                         color="red",
                         lw=2,
                         label="ROC curve (area = " + str(roc_auc) + ")")
                plt.plot([0, 1], [0, 1], color="blue", lw=2, linestyle="--")
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel("FPR")
                plt.ylabel("TPR")
                plt.title("ROC")
                plt.legend(loc="lower right")
                plt.show()

        if False and VISUALIZE:  # disabled by default
            # visualize result images

            # labels_test_pred = classify_char_pos(data_test)
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(data_test,
                                                      labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False) for x in group_prepped
                ]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

        # test different position finding methods using a distance function
        # on each word

        print("loading test words...", end="", flush=True)
        word_ims_test, char_poss_test = _load_words(filenames_test)
        print("done")

        test_range_start = 100
        test_range_end = 150
        distance_test = build_distance_test(
            word_ims_test[test_range_start:test_range_end],
            char_poss_test[test_range_start:test_range_end])

        if False:  # disabled by default
            # test the old peak-finding and connected component methods
            def build_find_thresh_peaks(peak_sigma, mean_divisor):
                """helper"""
                return partial(findletters.find_thresh_peaks,
                               peak_sigma=peak_sigma,
                               mean_divisor=mean_divisor)

            res = func.grid_search(func.pipe(build_find_thresh_peaks,
                                             distance_test),
                                   peak_sigma=[1.0, 1.5, 2.0, 2.5],
                                   mean_divisor=[0.7, 1.0, 1.3, 1.4, 1.6])
            for cfg, score in res:  # avoid shadowing the outer config
                print("peaks (", cfg["peak_sigma"], cfg["mean_divisor"],
                      ") :", score)

            find_comp = lambda x: findwords.find_conc_comp(x[16:-16, :],
                                                           merge=True)
            score = distance_test(find_comp, False)
            print("connected components:", score)

            find_comp_peaks = lambda word_im: findletters.find_combine(
                word_im, extract_char, find_comp, findletters.find_thresh_peaks
            )
            score = distance_test(find_comp_peaks)
            print("connected components + peaks:", score)

        def img_to_prob(img):
            """helper"""
            res = classifier.predict_proba(feat_extractor(img))
            # probabilities are False, True in 1x2 tensor
            # so [0, 1] is the True probability
            return res[0, 1]

        # find_comp = lambda x: findwords.find_conc_comp(x[16:-16, :], merge=True)
        # find_prob = lambda word_im: findletters.find_prob(
        #     word_im, half_width, extract_char, img_to_prob, thresh_true)
        # find_combine = lambda word_im: findletters.find_combine(
        #     word_im, extract_char,
        #     find_comp,
        #     find_prob)
        # score = distance_test(find_combine, False)
        # print("connected components + ML (", thresh_true, ") :", score)

        # full sweep: [0.0, 0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]
        for thresh in [0.5, 0.6, 0.7]:
            find_prob = lambda word_im: findletters.find_prob(
                word_im, config.half_width, extract_char, img_to_prob, thresh)
            score = distance_test(find_prob, False)
            print("ML (", thresh, ") :", score)
Example #5
def main(argv):
    """main program"""

    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_characters.pkl"
    else:
        model_filename = argv[3]

    print("run_charclassml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    torch.manual_seed(0)

    pad_image = partial(improc.pad_image,
                        width=config.pad_width,
                        height=config.pad_height)

    augment_func = pipe(
        pad_image,  # pad before rotations
        partial(improc.transform_random,
                trans_size=[config.trans_x_size, config.trans_y_size],
                rot_size=config.rot_size,
                scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(filenames_train)
    data_train, labels_train = dataset.prepare(
        data_train_raw, labels_train_raw, config.train.do_subsample,
        config.train.subsample_size, config.train.do_prep_balance,
        config.train.do_balance, config.train.balance_size,
        config.train.do_augment, config.train.augment_size, augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(filenames_dev)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw, labels_dev_raw, config.dev.do_subsample,
        config.dev.subsample_size, config.dev.do_prep_balance,
        config.dev.do_balance, config.dev.balance_size, config.dev.do_augment,
        config.dev.augment_size, augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(filenames_test)
    data_test, labels_test = dataset.prepare(
        data_test_raw, labels_test_raw, config.test.do_subsample,
        config.test.subsample_size, config.test.do_prep_balance,
        config.test.do_balance, config.test.balance_size,
        config.test.do_augment, config.test.augment_size, augment_func)

    # filter by label

    min_label_examples = 1
    keep_labels = sorted([
        label for label, group in ml.group_by_label(data_train, labels_train)
        if len(group) >= min_label_examples and label not in IGNORE_CHARS
    ])
    data_train, labels_train = dataset.filter_labels(data_train, labels_train,
                                                     keep_labels)
    data_dev, labels_dev = dataset.filter_labels(data_dev, labels_dev,
                                                 keep_labels)
    data_test, labels_test = dataset.filter_labels(data_test, labels_test,
                                                   keep_labels)

    print("done")

    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size:  ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count:    ", len(data_train))
    print("dev count:      ", len(data_dev))
    print("test count:     ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    if mode == MODE_TRAIN:

        print("training model...")

        if True:  # manual toggle: CNN here, traditional ML in the else branch
            # train a CNN

            prepare_callback = build_prepare_callback(data_dev, labels_dev)

            proc = imml.build_classification_process_cnn(
                data_train,
                labels_train,
                config.pad_width,
                config.pad_height,
                config.start_row,
                do_align=config.do_align,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")
        else:
            # traditional ML
            proc = imml.build_classification_process_charclass(
                data_train, labels_train, config.pad_width, config.pad_height,
                config.start_row)

        classify_char_image, prep_image, feat_extractor, classifier = proc

        print("done")

        # summarize results

        labels_test_pred = [classify_char_image(x) for x in data_test]
        print("score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))

        print("confusion matrix:")
        confusion_mat = sklearn.metrics.confusion_matrix(
            labels_test, labels_test_pred, labels=keep_labels)
        print(confusion_mat)
        np.savetxt(model_filename + ".confusion.tsv",
                   confusion_mat,
                   fmt="%d",
                   delimiter="\t",
                   header="\t".join(keep_labels))

        util.save_dill(proc, model_filename)

        if VISUALIZE:
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(data_test,
                                                      labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False) for x in group_prepped
                ]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

    if mode == "tune":

        classify_char_image = util.load_dill(model_filename)[0]

        # evaluate score by label
        for label in keep_labels:

            keep_idxs = [
                idx for idx, lbl in enumerate(labels_test) if lbl == label
            ]

            data_test_subset = [data_test[idx] for idx in keep_idxs]
            labels_test_subset = [labels_test[idx] for idx in keep_idxs]
            labels_test_pred_subset = [
                classify_char_image(x) for x in data_test_subset
            ]

            preds_grouped_counts = ml.group_by_label(data_test_subset,
                                                     labels_test_pred_subset)

            # print(labels_test_pred_subset)

            score = sklearn.metrics.accuracy_score(labels_test_subset,
                                                   labels_test_pred_subset)
            print(label, "\t", np.round(score, 3), "\t", len(keep_idxs), "\t",
                  [(x[0], len(x[1])) for x in reversed(preds_grouped_counts)])
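
The same invocation pattern as Example #4 applies; a hedged sketch with a hypothetical config path:

if __name__ == "__main__":
    import sys
    main(sys.argv)
    # e.g. main(["run_charclassml", MODE_TRAIN,
    #            "config/charclass.json",
    #            "models/classify_characters.pkl"])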