def prepare_callback(feat_extractor):
    """Given a feature extractor function and validation data, build a
    callback that validates the network during training."""

    val_feats = [feat_extractor(img) for img in data_validate]
    print("validation features size:", util.mbs(val_feats), "MiB")

    def callback(classifier):
        """Compute distance, ROC AUC, and accuracy on the validation set."""

        def img_to_prob(img):
            """Map a raw image to the classifier's True probability."""
            probs = classifier.predict_proba(feat_extractor(img))
            # predict_proba yields a 1x2 tensor ordered (False, True),
            # so index [0, 1] selects the True probability
            return probs[0, 1]

        print("distance test...", end="", flush=True)
        distance = distance_test(build_find_prob(img_to_prob), False)
        print("done")
        print("validation distance:", distance)

        print("predicting...", end="", flush=True)
        probs_true_pred = [
            classifier.predict_proba(feats)[0, 1]
            for feats in val_feats]
        labels_validate_pred = [prob > 0.5 for prob in probs_true_pred]
        print("done")

        # TODO: something generic here instead of sklearn
        fpr, tpr, _ = sklearn.metrics.roc_curve(
            labels_validate, probs_true_pred)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print("validation ROC AUC:", roc_auc)

        accuracy = sklearn.metrics.accuracy_score(
            labels_validate, labels_validate_pred)
        print("validation accuracy:", accuracy)

        return [
            ("val_distance", distance),
            ("val_roc_auc", roc_auc),
            ("val_accuracy", accuracy)]

    return callback
def prepare_callback(feat_extractor):
    """Given a feature extractor, build a callback that tests the network
    during training."""

    val_feats = [feat_extractor(img) for img in data_validate]
    print("validation features size:", util.mbs(val_feats), "MiB")

    def callback(classifier):
        """Report validation accuracy; the first two result slots are unused."""
        print("predicting...", end="", flush=True)
        labels_validate_pred = [classifier(feats) for feats in val_feats]
        print("done")

        # TODO: something generic here instead of sklearn
        accuracy = sklearn.metrics.accuracy_score(
            labels_validate, labels_validate_pred)
        print("validation accuracy:", accuracy)

        # placeholder entries keep the result shape consistent with the
        # three-metric validation callback variant
        return [
            ("nothing", 0.0),
            ("nothing", 0.0),
            ("val_accuracy", accuracy)]

    return callback
def main(argv):
    """main program: train or tune the character-position classifier

    argv[1] - mode (defaults to MODE_TUNE)
    argv[2] - configuration filename (defaults to CONFIG_DEFAULT)
    argv[3] - model filename (defaults to "models/classify_charpos.pkl")
    """

    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_charpos.pkl"
    else:
        model_filename = argv[3]

    print("run_charposml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    torch.manual_seed(0)
    thresh_true = 0.5

    # pad first so every sample has a uniform size before random transforms
    pad_image = partial(
        improc.pad_image,
        width=config.half_width * 2,
        height=config.pad_height)
    augment_func = pipe(
        pad_image,  # pad before transformations
        partial(improc.transform_random,
                trans_size=[config.trans_x_size, config.trans_y_size],
                rot_size=config.rot_size,
                scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(
        filenames_train, config.half_width, config.offset)
    data_train, labels_train = dataset.prepare(
        data_train_raw,
        labels_train_raw,
        config.train.do_subsample, config.train.subsample_size,
        config.train.do_prep_balance,
        config.train.do_balance, config.train.balance_size,
        config.train.do_augment, config.train.augment_size,
        augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(
        filenames_dev, config.half_width, config.offset)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw,
        labels_dev_raw,
        config.dev.do_subsample, config.dev.subsample_size,
        config.dev.do_prep_balance,
        config.dev.do_balance, config.dev.balance_size,
        config.dev.do_augment, config.dev.augment_size,
        augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(
        filenames_test, config.half_width, config.offset)
    data_test, labels_test = dataset.prepare(
        data_test_raw,
        labels_test_raw,
        config.test.do_subsample, config.test.subsample_size,
        config.test.do_prep_balance,
        config.test.do_balance, config.test.balance_size,
        config.test.do_augment, config.test.augment_size,
        augment_func)

    print("done")

    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size: ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count: ", len(data_train))
    print("dev count: ", len(data_dev))
    print("test count: ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    # labels appear to be (position-bool, letter) pairs; only the position
    # component is classified here — TODO confirm against _load_samples
    print("discarding letter information from labels")
    print()

    labels_train = [x[0] for x in labels_train]
    labels_dev = [x[0] for x in labels_dev]
    labels_test = [x[0] for x in labels_test]

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    extract_char = improc.extract_pos

    if mode == MODE_TRAIN:
        print("training model...")

        # word_ims_test, char_poss_test = _load_words(test_filenames)
        # distance_test = build_distance_test(word_ims_test, char_poss_test)
        distance_test = lambda x, y: 0.0

        if True:
            # train a CNN

            def build_find_prob(img_to_prob):
                """build a position-finding function from an image->prob function"""
                return lambda word_im: findletters.find_prob(
                    word_im, config.half_width, extract_char,
                    img_to_prob, thresh_true)

            prepare_callback = build_prepare_validation_callback(
                data_dev, labels_dev,
                build_find_prob,
                distance_test)

            proc = imml.build_classification_process_cnn(
                data_train, labels_train,
                config.half_width * 2,  # - 8 # I forget the reason I was doing this
                config.pad_height,
                config.start_row,
                do_align=False,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")

        else:
            # traditional ML
            proc = imml.build_classification_process_charpos(
                data_train, labels_train,
                config.half_width * 2,  # - 8,
                config.pad_height,
                config.start_row)

        classify_char_pos, prep_image, feat_extractor, classifier = proc
        print("done")

        # summarize results
        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]
        print("accuracy score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))
        print("confusion matrix:")
        # fix: `labels` must be passed by keyword; the extra positional
        # argument was deprecated and then removed in scikit-learn
        print(sklearn.metrics.confusion_matrix(
            labels_test, labels_test_pred, labels=[True, False]))

        # save model
        util.save_dill(proc, model_filename)

    if mode == MODE_TUNE:

        # load model
        proc = util.load_dill(model_filename)
        classify_char_pos, prep_image, feat_extractor, classifier = proc

        # predict on test data
        feats_test = [feat_extractor(x) for x in data_test]
        labels_test_pred = [classifier(x) for x in feats_test]

        # visualize boolean predictions
        if VISUALIZE:
            idx = 0
            slices_per_im = 320
            while idx < len(data_test):
                disp_im = _visualize_boolean_predictions(
                    data_test[idx:(idx + slices_per_im)],
                    labels_test[idx:(idx + slices_per_im)],
                    labels_test_pred[idx:(idx + slices_per_im)],
                    config.half_width,
                    config.offset)
                cv2.namedWindow("boolean predictions", cv2.WINDOW_NORMAL)
                cv2.imshow("boolean predictions", disp_im)
                cv2.waitKey()
                idx = idx + slices_per_im

        # calculate and visualize ROC AUC
        if False and VISUALIZE:
            # distances_test = model.decision_function(feats_test)
            # distances_test = classifier.model.predict_proba(feats_test)[:, 1]
            distances_test = [
                classifier.predict_proba(x)[0, 1]
                for x in feats_test]
            fpr, tpr, _ = sklearn.metrics.roc_curve(
                labels_test, distances_test)
            roc_auc = sklearn.metrics.auc(fpr, tpr)
            print("ROC AUC on test dataset:", roc_auc)

            if VISUALIZE:
                # visualize ROC curve
                from matplotlib import pyplot as plt
                plt.figure()
                plt.plot(
                    fpr, tpr, color="red", lw=2,
                    label="ROC curve (area = " + str(roc_auc) + ")")
                plt.plot([0, 1], [0, 1], color="blue", lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel("FPR")
                plt.ylabel("TPR")
                plt.title("ROC")
                plt.legend(loc="lower right")
                plt.show()

        if False and VISUALIZE:
            # visualize result images
            # labels_test_pred = classify_char_pos(data_test)
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(
                    data_test, labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False)
                    for x in group_prepped]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

        # test different position finding methods using a distance function
        # on each word

        print("loading test words...", end="", flush=True)
        word_ims_test, char_poss_test = _load_words(filenames_test)
        print("done")

        test_range_start = 100
        test_range_end = 150
        distance_test = build_distance_test(
            word_ims_test[test_range_start:test_range_end],
            char_poss_test[test_range_start:test_range_end])

        if False:
            # test the old peak-finding and connected component methods

            def build_find_thresh_peaks(peak_sigma, mean_divisor):
                """helper"""
                return partial(
                    findletters.find_thresh_peaks,
                    peak_sigma=peak_sigma,
                    mean_divisor=mean_divisor)

            res = func.grid_search(
                func.pipe(build_find_thresh_peaks, distance_test),
                peak_sigma=[1.0, 1.5, 2.0, 2.5],
                mean_divisor=[0.7, 1.0, 1.3, 1.4, 1.6])
            # fix: loop variable renamed so it no longer shadows the outer
            # `config`, which is used again further down in this function
            for search_config, score in res:
                print("peaks (",
                      search_config["peak_sigma"],
                      search_config["mean_divisor"],
                      ") :", score)

            find_comp = lambda x: findwords.find_conc_comp(
                x[16:-16, :], merge=True)
            score = distance_test(find_comp, False)
            print("connected components:", score)

            find_comp_peaks = lambda word_im: findletters.find_combine(
                word_im, extract_char, find_comp,
                findletters.find_thresh_peaks)
            score = distance_test(find_comp_peaks)
            print("connected components + peaks:", score)

        def img_to_prob(img):
            """helper"""
            res = classifier.predict_proba(feat_extractor(img))
            # probabilities are False, True in 1x2 tensor
            # so [0, 1] is the True probability
            return res[0, 1]

        # find_comp = lambda x: findwords.find_conc_comp(x[16:-16, :], merge=True)
        # find_prob = lambda word_im: findletters.find_prob(
        #     word_im, half_width, extract_char, img_to_prob, thresh_true)
        # find_combine = lambda word_im: findletters.find_combine(
        #     word_im, extract_char,
        #     find_comp,
        #     find_prob)
        # score = distance_test(find_combine, False)
        # print("connected components + ML (", thresh_true, ") :", score)

        for thresh in [0.5, 0.6, 0.7]:  # [0.0, 0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]:
            # thresh is used immediately inside this iteration, so the
            # late-binding lambda closure is safe here
            find_prob = lambda word_im: findletters.find_prob(
                word_im, config.half_width, extract_char,
                img_to_prob, thresh)
            score = distance_test(find_prob, False)
            print("ML (", thresh, ") :", score)
def build_classification_process_cnn(
        data_train,
        labels_train,
        pad_width,
        pad_height,
        start_row,
        do_align,
        nn_arch,
        nn_opt,
        epoch_log_filename,
        prepare_callback,
        save_model_filename,
        tsv_filename):
    """build a classification process for images using a CNN

    data_train: list of training images
    labels_train: training labels, parallel to data_train
    pad_width, pad_height: dimensions images are padded to
    start_row: first image row to keep (rows above are cropped away)
    do_align: whether to align images after padding
    nn_arch, nn_opt: network architecture and optimizer settings
    epoch_log_filename: per-epoch log file
    prepare_callback: function of a feature extractor returning a
        training-time callback, or None for no callback
    save_model_filename: where work-in-progress models are saved
    tsv_filename: training status TSV file

    returns (classify_image, color_to_grayuint, feat_extractor, classifier)
    """

    # TODO: do I pull out all the feature extractors or flatten the configuration?
    # I'm more inclined to flatten the configuration for this approach.

    pad_only = partial(improc.pad_image, width=pad_width, height=pad_height)
    if do_align:
        pad_image = lambda x: improc.align(pad_only(x))
    else:
        pad_image = pad_only

    def color_to_grayuint(image):
        """prepare image to uint8: crop, pad, grayscale, and invert"""
        image = image[start_row:, :]
        return np.array(
            255.0 - improc.grayscale(pad_image(image)),
            dtype=np.uint8)

    def grayuint_to_grayfloat(image):
        """convert uint8 image to floating point in [-0.5, 0.5]"""
        img_g = image / 255.0 - 0.5
        return np.array(img_g, np.float32)

    if VISUALIZE:
        charclass.visualize_training_data(
            data_train, labels_train, color_to_grayuint)

    feat_extractor = func.pipe(color_to_grayuint, grayuint_to_grayfloat)

    # fix: the original called prepare_callback(feat_extractor)
    # unconditionally but later tested it against None for the log
    # filename; a None callback is now handled consistently
    if prepare_callback is not None:
        print("preparing callback...", end="", flush=True)
        callback = prepare_callback(feat_extractor)
        print("done")
        filename, ext = os.path.splitext(epoch_log_filename)
        callback_log_filename = filename + "_callback" + ext
    else:
        callback = None
        callback_log_filename = None

    if not DO_LAZY_EXTRACTION:
        feats_train = [feat_extractor(x) for x in data_train]
        lazy_extractor = None
    else:
        # keep only the uint8 stage in memory and defer the float
        # conversion to training time to save memory
        feats_train = [color_to_grayuint(x) for x in data_train]
        del data_train
        gc.collect()
        lazy_extractor = grayuint_to_grayfloat

    print("training features (input to CNN) size:",
          util.mbs(feats_train), "MiB")

    classifier = cnn.experimental_cnn(
        nn_arch=nn_arch,
        nn_opt=nn_opt,
        epoch_log_filename=epoch_log_filename,
        callback_log_filename=callback_log_filename,
        callback=callback,
        callback_rate=CALLBACK_RATE,
        lazy_extractor=lazy_extractor,
        save_model_filename=save_model_filename,
        tsv_filename=tsv_filename
    )(feats_train, labels_train)

    def classify_image(image):
        """classify a single raw image"""
        return classifier(feat_extractor(image))

    return (classify_image, color_to_grayuint, feat_extractor, classifier)
def prepare(data_input, labels_input, do_subsample, subsample_size, do_prep_balance, do_balance, balance_size, do_augment, augment_size, augment_func, seed=0): """prepare a dataset from a raw list of data and labels by subsampling, balancing, and augmenting""" # WARNING! this function is not pure! # - destroys its inputs to save memory # - resets RNG seeds before each operation data = list(data_input) labels = list(labels_input) data_input.clear() labels_input.clear() if VERBOSE: print("\tunprepared data size:", util.mbs(data), "MiB") label_counts = ml.label_counts(labels) print("\tunprepared data group sizes:", label_counts[0]) if do_subsample: if VERBOSE: print("\tsubsampling with size", subsample_size) np.random.seed(seed) random.seed(seed) data, labels = ml.subsample(data, labels, subsample_size) gc.collect() if do_prep_balance: if VERBOSE: print("\tpreparing for balance") np.random.seed(seed) random.seed(seed) data, labels = ml.prepare_balance(data, labels, balance_size) gc.collect() if do_subsample or do_prep_balance: # turn everything that's left into fresh arrays rather than memory views print("\tconverting views to arrays to free old data") data = [np.copy(x) for x in data] gc.collect() if do_balance: if VERBOSE: print("\tbalancing with size", balance_size) np.random.seed(seed) random.seed(seed) data, labels = ml.balance(data, labels, balance_size, augment_func) gc.collect() if do_augment: if VERBOSE: print("\taugmenting with size", augment_size) np.random.seed(seed) random.seed(seed) data, labels = ml.augment(data, labels, augment_size, augment_func) if VERBOSE: print("\tdone") return data, labels
def main(argv):
    """main program: train or tune the character classifier

    argv[1] - mode (defaults to MODE_TUNE)
    argv[2] - configuration filename (defaults to CONFIG_DEFAULT)
    argv[3] - model filename (defaults to "models/classify_characters.pkl")
    """

    if len(argv) < 2:
        mode = MODE_TUNE
    else:
        mode = argv[1]

    if len(argv) < 3:
        config = CONFIG_DEFAULT
    else:
        config = load_config(argv[2])

    if len(argv) < 4:
        model_filename = "models/classify_characters.pkl"
    else:
        model_filename = argv[3]

    print("run_charclassml")
    print("---------------")
    cf.pretty_print(config)
    print("mode:", mode)
    print("model filename:", model_filename)

    torch.manual_seed(0)

    # pad first so every sample has a uniform size before random transforms
    pad_image = partial(
        improc.pad_image,
        width=config.pad_width,
        height=config.pad_height)
    augment_func = pipe(
        pad_image,  # pad before rotations
        partial(improc.transform_random,
                trans_size=[config.trans_x_size, config.trans_y_size],
                rot_size=config.rot_size,
                scale_size=config.scale_size))

    filenames_train = data.pages(config.train.idxs)
    filenames_dev = data.pages(config.dev.idxs)
    filenames_test = data.pages(config.test.idxs)

    # for integration testing
    # filenames_train = data.pages([5, 6, 7])
    # filenames_dev = data.pages([8])
    # filenames_test = data.pages([9])

    print("loading and preparing datasets...")

    print("train files:", filenames_train)
    data_train_raw, labels_train_raw = _load_samples(filenames_train)
    data_train, labels_train = dataset.prepare(
        data_train_raw,
        labels_train_raw,
        config.train.do_subsample, config.train.subsample_size,
        config.train.do_prep_balance,
        config.train.do_balance, config.train.balance_size,
        config.train.do_augment, config.train.augment_size,
        augment_func)

    print("dev files:", filenames_dev)
    data_dev_raw, labels_dev_raw = _load_samples(filenames_dev)
    data_dev, labels_dev = dataset.prepare(
        data_dev_raw,
        labels_dev_raw,
        config.dev.do_subsample, config.dev.subsample_size,
        config.dev.do_prep_balance,
        config.dev.do_balance, config.dev.balance_size,
        config.dev.do_augment, config.dev.augment_size,
        augment_func)

    print("test files:", filenames_test)
    data_test_raw, labels_test_raw = _load_samples(filenames_test)
    data_test, labels_test = dataset.prepare(
        data_test_raw,
        labels_test_raw,
        config.test.do_subsample, config.test.subsample_size,
        config.test.do_prep_balance,
        config.test.do_balance, config.test.balance_size,
        config.test.do_augment, config.test.augment_size,
        augment_func)

    # filter by label: keep only labels with enough training examples
    # that are not in the ignore list
    min_label_examples = 1
    keep_labels = sorted([
        x for x, y in ml.group_by_label(data_train, labels_train)
        if len(y) >= min_label_examples and x not in IGNORE_CHARS])

    data_train, labels_train = dataset.filter_labels(
        data_train, labels_train, keep_labels)
    data_dev, labels_dev = dataset.filter_labels(
        data_dev, labels_dev, keep_labels)
    data_test, labels_test = dataset.filter_labels(
        data_test, labels_test, keep_labels)

    print("done")

    print("train data size:", util.mbs(data_train), "MiB")
    print("dev data size: ", util.mbs(data_dev), "MiB")
    print("test data size: ", util.mbs(data_test), "MiB")
    print("train count: ", len(data_train))
    print("dev count: ", len(data_dev))
    print("test count: ", len(data_test))
    print()

    counts_train = ml.label_counts(labels_train)
    print("train group sizes:", counts_train[0])
    print()
    counts_dev = ml.label_counts(labels_dev)
    print("dev group sizes:", counts_dev[0])
    print()
    counts_test = ml.label_counts(labels_test)
    print("test group sizes:", counts_test[0])
    print()

    # print("training group sizes change in balancing:")
    # for x, y in train_unbalanced_counts[0]:
    #     count = train_counts[1].get(x, 0)
    #     print(x, round(count / y, 3))
    # print()

    if mode == MODE_TRAIN:
        print("training model...")

        if True:
            # train a CNN
            prepare_callback = build_prepare_callback(data_dev, labels_dev)
            proc = imml.build_classification_process_cnn(
                data_train, labels_train,
                config.pad_width,
                config.pad_height,
                config.start_row,
                do_align=config.do_align,
                nn_arch=config.nn_arch,
                nn_opt=config.nn_opt,
                epoch_log_filename=model_filename + ".log.txt",
                prepare_callback=prepare_callback,
                save_model_filename=model_filename + ".wip",
                tsv_filename=model_filename + ".status")
        else:
            # traditional ML
            proc = imml.build_classification_process_charclass(
                data_train, labels_train,
                config.pad_width,
                config.pad_height,
                config.start_row)

        classify_char_image, prep_image, feat_extractor, classifier = proc
        print("done")

        # summarize results
        labels_test_pred = [classify_char_image(x) for x in data_test]
        print("score on test dataset:",
              sklearn.metrics.accuracy_score(labels_test, labels_test_pred))
        print("confusion matrix:")
        # fix: `labels` must be passed by keyword; the extra positional
        # argument was deprecated and then removed in scikit-learn
        confusion_mat = sklearn.metrics.confusion_matrix(
            labels_test, labels_test_pred, labels=keep_labels)
        print(confusion_mat)
        np.savetxt(
            model_filename + ".confusion.tsv",
            confusion_mat,
            fmt="%d",
            delimiter="\t",
            header="\t".join(keep_labels))

        util.save_dill(proc, model_filename)

        if VISUALIZE:
            chars_confirmed = []
            chars_redo = []

            # show results
            for cur_label, group in ml.group_by_label(
                    data_test, labels_test_pred):
                print(cur_label)
                group_prepped = [(prep_image(x), None) for x in group]
                group_pred = [
                    Sample(x, cur_label, 0.0, False)
                    for x in group_prepped]
                chars_working, chars_done = charclass.label_chars(group_pred)
                chars_confirmed += chars_working
                chars_redo += chars_done

    # fix: compare against the MODE_TUNE constant instead of the string
    # literal "tune", consistent with the MODE_TRAIN check above and the
    # default assigned at the top of this function
    if mode == MODE_TUNE:
        classify_char_image = util.load_dill(model_filename)[0]

        # evaluate score by label
        for label in keep_labels:
            keep_idxs = [
                idx for idx, lbl in enumerate(labels_test)
                if lbl == label]
            data_test_subset = [data_test[idx] for idx in keep_idxs]
            labels_test_subset = [labels_test[idx] for idx in keep_idxs]
            labels_test_pred_subset = [
                classify_char_image(x) for x in data_test_subset]
            preds_grouped_counts = ml.group_by_label(
                data_test_subset, labels_test_pred_subset)
            # print(labels_test_pred_subset)
            score = sklearn.metrics.accuracy_score(
                labels_test_subset, labels_test_pred_subset)
            print(label, "\t",
                  np.round(score, 3), "\t",
                  len(keep_idxs), "\t",
                  [(x[0], len(x[1])) for x in reversed(preds_grouped_counts)])