def filter_labels(data, labels, keep_labels):
    """Filter a dataset by label, keeping only samples whose label is in keep_labels.

    :param data: sequence of data samples
    :param labels: sequence of labels, parallel to data
    :param keep_labels: collection of labels to retain
    :return: (data, labels) pair containing only the kept samples,
             grouped by label (order within a group is preserved)
    """
    grouped = dict(ml.group_by_label(data, labels))
    grouped_filtered = {x: y for x, y in grouped.items() if x in keep_labels}
    pairs = [(sample, label) for label, group in grouped_filtered.items()
             for sample in group]
    # guard the empty case: zip(*[]) yields nothing at all, which would make
    # a caller's two-target unpack ("data, labels = ...") raise ValueError
    if not pairs:
        return [], []
    return zip(*pairs)
def visualize_training_data(data_train, labels_train, color_to_gray):
    """Interactively display training samples, one label group at a time."""
    for label, samples in ml.group_by_label(data_train, labels_train):
        print("label:", label)
        # convert each image to grayscale and pair it with a placeholder
        prepped = []
        for image in samples:
            prepped.append((color_to_gray(image), None))
        predictions = [Sample(item, label, 0.0, False) for item in prepped]
        # display only; the annotations returned by label_chars are discarded
        label_chars(predictions)
def test_balance(self):
    """Verify dataset balancing for fractional and absolute target sizes."""
    samples = [(0.0, "a")] * 4 + [(1.0, "b")]
    data, labels = zip(*samples)
    identity = lambda x: x

    # fractional balance size: every group should end up with 2 samples
    bal_data, bal_labels = ml.balance(data, labels, 0.5, identity)
    grouped = dict(ml.group_by_label(bal_data, bal_labels))
    for _, group in grouped.items():
        self.assertEqual(len(group), 2)

    # integer balance size: every group should end up with 8 samples
    bal_data, bal_labels = ml.balance(data, labels, 8, identity)
    grouped = dict(ml.group_by_label(bal_data, bal_labels))
    for _, group in grouped.items():
        self.assertEqual(len(group), 8)
def main(argv):
    """Train or tune the character-position classifier (run_charposml).

    argv[1]: mode (MODE_TRAIN or MODE_TUNE; default MODE_TUNE)
    argv[2]: configuration file to load (default CONFIG_DEFAULT)
    argv[3]: model output filename (default "models/classify_charpos.pkl")
    """

    # ----- command-line arguments with defaults -----
    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_charpos.pkl"
    else:
        model_filename = argv[3]

    print("run_charposml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    # fixed seed for reproducible training runs
    torch.manual_seed(0)

    # probability threshold for accepting a character position
    thresh_true = 0.5

    # ----- data augmentation pipeline -----
    pad_image = partial(
        improc.pad_image,
        width=config.half_width * 2,
        height=config.pad_height)

    augment_func = pipe(
        pad_image,  # pad before transformations
        partial(
            improc.transform_random,
            trans_size=[config.trans_x_size, config.trans_y_size],
            rot_size=config.rot_size,
            scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    # ----- load and prepare train / dev / test datasets -----
    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(
        filenames_train, config.half_width, config.offset)
    data_train, labels_train = dataset.prepare(
        data_train_raw,
        labels_train_raw,
        config.train.do_subsample,
        config.train.subsample_size,
        config.train.do_prep_balance,
        config.train.do_balance,
        config.train.balance_size,
        config.train.do_augment,
        config.train.augment_size,
        augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(
        filenames_dev, config.half_width, config.offset)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw,
        labels_dev_raw,
        config.dev.do_subsample,
        config.dev.subsample_size,
        config.dev.do_prep_balance,
        config.dev.do_balance,
        config.dev.balance_size,
        config.dev.do_augment,
        config.dev.augment_size,
        augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(
        filenames_test, config.half_width, config.offset)
    data_test, labels_test = dataset.prepare(
        data_test_raw,
        labels_test_raw,
        config.test.do_subsample,
        config.test.subsample_size,
        config.test.do_prep_balance,
        config.test.do_balance,
        config.test.balance_size,
        config.test.do_augment,
        config.test.augment_size,
        augment_func)

    print("done")

    # ----- dataset summaries -----
    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size: ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count: ", len(data_train))
    print("dev count: ", len(data_dev))
    print("test count: ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    # labels appear to be tuples whose first element is the position flag;
    # drop the remaining (letter) information — TODO confirm against _load_samples
    print("discarding letter information from labels")
    print()

    labels_train = [x[0] for x in labels_train]
    labels_dev = [x[0] for x in labels_dev]
    labels_test = [x[0] for x in labels_test]

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    extract_char = improc.extract_pos

    if mode == MODE_TRAIN:

        print("training model...")

        # word_ims_test, char_poss_test = _load_words(test_filenames)
        # distance_test = build_distance_test(word_ims_test, char_poss_test)
        distance_test = lambda x, y: 0.0

        if True:
            # train a CNN
            def build_find_prob(img_to_prob):
                """build a position-finding function from a probability model"""
                return lambda word_im: findletters.find_prob(
                    word_im, config.half_width, extract_char,
                    img_to_prob, thresh_true)

            prepare_callback = build_prepare_validation_callback(
                data_dev,
                labels_dev,
                build_find_prob,
                distance_test)

            proc = imml.build_classification_process_cnn(
                data_train,
                labels_train,
                config.half_width * 2,  # - 8 # I forget the reason I was doing this
                config.pad_height,
                config.start_row,
                do_align=False,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")

        else:
            # traditional ML
            proc = imml.build_classification_process_charpos(
                data_train,
                labels_train,
                config.half_width * 2,  # - 8,
                config.pad_height,
                config.start_row)

        classify_char_pos, prep_image, feat_extractor, classifier = proc
        print("done")

        # summarize results on the test set
        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]

        print("accuracy score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))

        print("confusion matrix:")
        # "labels" is keyword-only in scikit-learn >= 1.0; the old third
        # positional argument raises a TypeError there
        print(sklearn.metrics.confusion_matrix(
            labels_test, labels_test_pred, labels=[True, False]))

        # save model
        util.save_dill(proc, model_filename)

    if mode == MODE_TUNE:

        # load model
        proc = util.load_dill(model_filename)
        classify_char_pos, prep_image, feat_extractor, classifier = proc

        # predict on test data
        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]

        # visualize boolean predictions in pages of slices_per_im samples
        if VISUALIZE:
            idx = 0
            slices_per_im = 320
            while idx < len(data_test):
                disp_im = _visualize_boolean_predictions(
                    data_test[idx:(idx + slices_per_im)],
                    labels_test[idx:(idx + slices_per_im)],
                    labels_test_pred[idx:(idx + slices_per_im)],
                    config.half_width,
                    config.offset)
                cv2.namedWindow("boolean predictions", cv2.WINDOW_NORMAL)
                cv2.imshow("boolean predictions", disp_im)
                cv2.waitKey()
                idx = idx + slices_per_im

        # calculate and visualize ROC AUC (currently disabled)
        if False and VISUALIZE:
            # distances_test = model.decision_function(feats_test)
            # distances_test = classifier.model.predict_proba(feats_test)[:, 1]
            distances_test = [
                classifier.predict_proba(x)[0, 1]
                for x in feats_test
            ]
            fpr, tpr, _ = sklearn.metrics.roc_curve(labels_test, distances_test)
            roc_auc = sklearn.metrics.auc(fpr, tpr)
            print("ROC AUC on test dataset:", roc_auc)

            if VISUALIZE:
                # visualize ROC curve
                from matplotlib import pyplot as plt
                plt.figure()
                plt.plot(
                    fpr, tpr, color="red", lw=2,
                    label="ROC curve (area = " + str(roc_auc) + ")")
                plt.plot([0, 1], [0, 1], color="blue", lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel("FPR")
                plt.ylabel("TPR")
                plt.title("ROC")
                plt.legend(loc="lower right")
                plt.show()

        # visualize result images (currently disabled)
        if False and VISUALIZE:
            # labels_test_pred = classify_char_pos(data_test)
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(data_test, labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False) for x in group_prepped
                ]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

        # test different position finding methods using a distance function
        # on each word

        print("loading test words...", end="", flush=True)
        word_ims_test, char_poss_test = _load_words(filenames_test)
        print("done")

        test_range_start = 100
        test_range_end = 150
        distance_test = build_distance_test(
            word_ims_test[test_range_start:test_range_end],
            char_poss_test[test_range_start:test_range_end])

        if False:
            # test the old peak-finding and connected component methods

            def build_find_thresh_peaks(peak_sigma, mean_divisor):
                """helper"""
                return partial(
                    findletters.find_thresh_peaks,
                    peak_sigma=peak_sigma,
                    mean_divisor=mean_divisor)

            res = func.grid_search(
                func.pipe(build_find_thresh_peaks, distance_test),
                peak_sigma=[1.0, 1.5, 2.0, 2.5],
                mean_divisor=[0.7, 1.0, 1.3, 1.4, 1.6])
            # NOTE: the loop variable was previously named "config", which
            # shadowed (and would have clobbered) the configuration object
            # used later in this function
            for grid_config, score in res:
                print("peaks (",
                      grid_config["peak_sigma"],
                      grid_config["mean_divisor"],
                      ") :", score)

            find_comp = lambda x: findwords.find_conc_comp(
                x[16:-16, :], merge=True)
            score = distance_test(find_comp, False)
            print("connected components:", score)

            find_comp_peaks = lambda word_im: findletters.find_combine(
                word_im, extract_char,
                find_comp,
                findletters.find_thresh_peaks)
            score = distance_test(find_comp_peaks)
            print("connected components + peaks:", score)

        def img_to_prob(img):
            """helper"""
            res = classifier.predict_proba(feat_extractor(img))
            # probabilities are False, True in 1x2 tensor
            # so [0, 1] is the True probability
            return res[0, 1]

        # find_comp = lambda x: findwords.find_conc_comp(x[16:-16, :], merge=True)
        # find_prob = lambda word_im: findletters.find_prob(
        #     word_im, half_width, extract_char, img_to_prob, thresh_true)
        # find_combine = lambda word_im: findletters.find_combine(
        #     word_im, extract_char,
        #     find_comp,
        #     find_prob)
        # score = distance_test(find_combine, False)
        # print("connected components + ML (", thresh_true, ") :", score)

        for thresh in [0.5, 0.6, 0.7]:  # [0.0, 0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]:
            find_prob = lambda word_im: findletters.find_prob(
                word_im, config.half_width, extract_char, img_to_prob, thresh)
            score = distance_test(find_prob, False)
            print("ML (", thresh, ") :", score)
def main(argv):
    """Train or tune the character classifier (run_charclassml).

    argv[1]: mode (MODE_TRAIN or MODE_TUNE; default MODE_TUNE)
    argv[2]: configuration file to load (default CONFIG_DEFAULT)
    argv[3]: model output filename (default "models/classify_characters.pkl")
    """

    # ----- command-line arguments with defaults -----
    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_characters.pkl"
    else:
        model_filename = argv[3]

    print("run_charclassml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    # fixed seed for reproducible training runs
    torch.manual_seed(0)

    # ----- data augmentation pipeline -----
    pad_image = partial(
        improc.pad_image,
        width=config.pad_width,
        height=config.pad_height)

    augment_func = pipe(
        pad_image,  # pad before rotations
        partial(
            improc.transform_random,
            trans_size=[config.trans_x_size, config.trans_y_size],
            rot_size=config.rot_size,
            scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    # ----- load and prepare train / dev / test datasets -----
    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(filenames_train)
    data_train, labels_train = dataset.prepare(
        data_train_raw,
        labels_train_raw,
        config.train.do_subsample,
        config.train.subsample_size,
        config.train.do_prep_balance,
        config.train.do_balance,
        config.train.balance_size,
        config.train.do_augment,
        config.train.augment_size,
        augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(filenames_dev)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw,
        labels_dev_raw,
        config.dev.do_subsample,
        config.dev.subsample_size,
        config.dev.do_prep_balance,
        config.dev.do_balance,
        config.dev.balance_size,
        config.dev.do_augment,
        config.dev.augment_size,
        augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(filenames_test)
    data_test, labels_test = dataset.prepare(
        data_test_raw,
        labels_test_raw,
        config.test.do_subsample,
        config.test.subsample_size,
        config.test.do_prep_balance,
        config.test.do_balance,
        config.test.balance_size,
        config.test.do_augment,
        config.test.augment_size,
        augment_func)

    # ----- filter by label -----
    # keep labels with at least min_label_examples training samples
    # that are not in the ignore set
    min_label_examples = 1
    keep_labels = sorted([
        x for x, y in ml.group_by_label(data_train, labels_train)
        if len(y) >= min_label_examples and x not in IGNORE_CHARS
    ])

    data_train, labels_train = dataset.filter_labels(
        data_train, labels_train, keep_labels)
    data_dev, labels_dev = dataset.filter_labels(
        data_dev, labels_dev, keep_labels)
    data_test, labels_test = dataset.filter_labels(
        data_test, labels_test, keep_labels)

    print("done")

    # ----- dataset summaries -----
    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size: ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count: ", len(data_train))
    print("dev count: ", len(data_dev))
    print("test count: ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    if mode == MODE_TRAIN:

        print("training model...")

        if True:
            # train a CNN
            prepare_callback = build_prepare_callback(data_dev, labels_dev)

            proc = imml.build_classification_process_cnn(
                data_train,
                labels_train,
                config.pad_width,
                config.pad_height,
                config.start_row,
                do_align=config.do_align,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")

        else:
            # traditional ML
            proc = imml.build_classification_process_charclass(
                data_train,
                labels_train,
                config.pad_width,
                config.pad_height,
                config.start_row)

        classify_char_image, prep_image, feat_extractor, classifier = proc
        print("done")

        # summarize results on the test set
        labels_test_pred = [classify_char_image(x) for x in data_test]
        print("score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))

        print("confusion matrix:")
        # "labels" is keyword-only in scikit-learn >= 1.0; the old third
        # positional argument raises a TypeError there
        confusion_mat = sklearn.metrics.confusion_matrix(
            labels_test, labels_test_pred, labels=keep_labels)
        print(confusion_mat)
        np.savetxt(
            model_filename + ".confusion.tsv",
            confusion_mat,
            fmt="%d",
            delimiter="\t",
            header="\t".join(keep_labels))

        util.save_dill(proc, model_filename)

        if VISUALIZE:
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(data_test, labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False) for x in group_prepped
                ]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

    # use the MODE_TUNE constant for consistency with the default above
    # (previously compared against the magic string "tune")
    if mode == MODE_TUNE:
        classify_char_image = util.load_dill(model_filename)[0]

        # evaluate score by label
        for label in keep_labels:
            keep_idxs = [
                idx for idx, lbl in enumerate(labels_test)
                if lbl == label
            ]
            data_test_subset = [data_test[idx] for idx in keep_idxs]
            labels_test_subset = [labels_test[idx] for idx in keep_idxs]
            labels_test_pred_subset = [
                classify_char_image(x) for x in data_test_subset
            ]
            preds_grouped_counts = ml.group_by_label(
                data_test_subset, labels_test_pred_subset)
            # print(labels_test_pred_subset)
            score = sklearn.metrics.accuracy_score(
                labels_test_subset, labels_test_pred_subset)
            print(label, "\t",
                  np.round(score, 3), "\t",
                  len(keep_idxs), "\t",
                  [(x[0], len(x[1])) for x in reversed(preds_grouped_counts)])