def generate_sets(neighbours, blur_scale, verbose=0, learn_count=70):
    """Split the entire dataset into a learning set and a test set.

    The first ``learn_count`` occurrences of each distinct character value
    go into the learning set; any further occurrences go into the test set.
    Both sets are dumped to 'learning_set_<blur_scale>_<neighbours>.dat' and
    'test_set_<blur_scale>_<neighbours>.dat' and returned as a tuple
    ``(learning_set, test_set)``.

    :param neighbours: LBP neighbour count used to select the character cache.
    :param blur_scale: blur scale used to select the character cache.
    :param verbose: if truthy, print progress messages.
    :param learn_count: occurrences of each value kept for learning
                        (default 70, the original hard-coded split).
    """
    suffix = '_%s_%s' % (blur_scale, neighbours)
    learning_set_file = 'learning_set%s.dat' % suffix
    test_set_file = 'test_set%s.dat' % suffix

    chars = load_characters(neighbours, blur_scale, verbose=verbose)

    if verbose:
        print('Going to generate learning set and test set...')

    learning_set = []
    test_set = []
    # Per-value occurrence counter.  Replaces the original
    # learned.count(value) scan (O(n) per character, O(n^2) total)
    # with an O(1) dict lookup; the split produced is identical.
    seen = {}

    for char in chars:
        if seen.get(char.value, 0) >= learn_count:
            test_set.append(char)
        else:
            learning_set.append(char)
            seen[char.value] = seen.get(char.value, 0) + 1

    if verbose:
        print('Learning set: %s' % [c.value for c in learning_set])
        print('\nTest set: %s' % [c.value for c in test_set])
        print('\nSaving learning set...')

    fdump(learning_set, learning_set_file)

    if verbose:
        print('Saving test set...')

    fdump(test_set, test_set_file)

    return learning_set, test_set
def load_characters(neighbours, blur_scale, verbose=0):
    """Load the cached Character objects, generating them from the images
    on disk when no cache file exists.

    On a cache miss, every image under ``IMAGES_FOLDER/<char>/`` is read,
    normalized (blurred and height-normalized), wrapped in a Character with
    a single-cell feature vector, and the resulting list is cached in
    'characters_<blur_scale>_<neighbours>.dat'.

    :param neighbours: LBP neighbour count used for the feature vectors.
    :param blur_scale: Gaussian blur scale for image normalization.
    :param verbose: if truthy, print progress messages.
    :return: list of Character objects.
    """
    chars_file = 'characters_%s_%s.dat' % (blur_scale, neighbours)

    if exists(chars_file):
        # Progress messages are now gated on `verbose`, consistent with
        # generate_sets() and the other messages in this function.
        if verbose:
            print('Loading characters...')
        chars = fload(chars_file)
    else:
        if verbose:
            print('Going to generate character objects...')
        chars = []
        for char in sorted(listdir(IMAGES_FOLDER)):
            count = 0
            for image in sorted(listdir(IMAGES_FOLDER + char)):
                image = GrayscaleImage(IMAGES_FOLDER + char + '/' + image)
                norm = NormalizedCharacterImage(image, blur=blur_scale,
                                                height=NORMALIZED_HEIGHT)
                character = Character(char, [], norm)
                character.get_single_cell_feature_vector(neighbours)
                chars.append(character)
                count += 1
            if verbose:
                print('Loaded character %s %d times' % (char, count))
        # Only write the cache on the generation path — re-dumping a
        # freshly loaded cache would be redundant I/O.
        if verbose:
            print('Saving characters...')
        fdump(chars, chars_file)

    return chars