def load_test_page(page_name, model):
    """Load a test page and return its characters as reduced feature vectors.

    The page's character images are turned into fixed-size feature vectors
    using the bounding box stored in the model. Characters that look noisy
    are thresholded before the vectors are handed to
    ``reduce_dimensions_test``.

    :param page_name: name of page file
    :param model: dictionary storing data passed from training stage
    :return: whatever ``reduce_dimensions_test`` returns (intended to be one
        10-dimensional row per character)
    """
    box = model['bbox_size']
    char_images = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(char_images, box)

    for pixels in fvectors_test:
        flat = pixels.flatten()
        # Count of values below 255 minus the count of exact zeros.
        grey_count = np.sum(flat < 255) - np.sum(flat == 0)
        if grey_count <= 75:
            continue
        # Noisy character: snap dark values to 0 and brighter ones to 255.
        # The row is edited in place.
        pixels[pixels < 20] = 0
        pixels[pixels > 120] = 255

    return reduce_dimensions_test(fvectors_test, model)
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Every character image is smoothed once with a Gaussian filter
    (sigma 0.9) before feature extraction. The PCA matrix learnt from the
    full feature vectors is stored in the model, and the model is finally
    passed through ``generate_dictionaries``.

    Params:
    train_page_names - list of training page names

    Returns:
    The dictionary returned by ``generate_dictionaries``; before that call
    it holds 'labels_train', 'bbox_size', 'pca_matrix' and 'fvectors_train'.
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        # Only the images added by this page are filtered. Filtering the
        # whole accumulated list here would blur the earlier pages again on
        # every iteration.
        # NOTE(review): assumes utils.load_char_images returns the images
        # passed in followed by the new page's images - confirm.
        already_loaded = len(images_train)
        loaded = utils.load_char_images(page_name, images_train)
        print('Applying gaussian filter to page', page_name.split('.')[1])
        images_train = list(loaded[:already_loaded]) + [
            ndimage.gaussian_filter(image, 0.9)
            for image in loaded[already_loaded:]
        ]
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    model_data['pca_matrix'] = learn_pca(fvectors_train_full)
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()

    print('Generating dictionaries of words for evaluation stage')
    model_data = generate_dictionaries(model_data)
    return model_data
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size' and 'fvectors_train' entries
    """
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)

    # reduce_dimensions receives the model with labels and bbox already set.
    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Reducing to 10 dimensions')
    reduced = reduce_dimensions(full_vectors, model_data)
    model_data['fvectors_train'] = reduced.tolist()
    return model_data
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    The full feature vectors are projected onto the axes returned by
    ``principal_components(..., 40)``; ``get_ten`` then picks the columns
    that are kept as the training features.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'v', 'mean', 'f' and
    'fvectors_train' entries
    """
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)

    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Reducing to 10 dimensions')
    axes = principal_components(full_vectors, 40)
    model_data['v'] = axes.tolist()

    # np.mean with no axis: a single scalar mean over every entry.
    overall_mean = np.mean(full_vectors)
    model_data['mean'] = overall_mean.tolist()

    projected = np.dot(full_vectors - overall_mean, axes)
    chosen = get_ten(projected, model_data)
    model_data['f'] = chosen.tolist()
    model_data['fvectors_train'] = projected[:, chosen].tolist()
    return model_data
def process_training_data(train_page_names, noise='saltandpepper'):
    """Perform the training stage and return results in a dictionary.

    The feature vectors are split into even-indexed ("clean") and
    odd-indexed ("noisy") samples. Synthetic noise is added to the noisy
    half before both halves are reduced together. The stored labels are
    reordered the same way (even indices first, then odd indices).

    Params:
    train_page_names - list of training page names
    noise - String, default is "saltandpepper"; "gaussian" selects Gaussian
        noise. Any other value falls back to salt and pepper noise.

    Returns:
    dict with 'bbox_size', 'labels_train' and 'fvectors_train' entries
    """
    print('- Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('- Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)
    clean = fvectors_train_full[0::2]
    noisy = fvectors_train_full[1::2]

    model_data = dict()
    model_data['bbox_size'] = bbox_size
    # combine labels differently to match the way we use train data
    model_data['labels_train'] = np.concatenate(
        (labels_train[0::2], labels_train[1::2])).tolist()

    print('- Adding noise to a half of the data')
    n_features = noisy.shape[1] if noisy.ndim > 1 else 0
    if noise == 'gaussian':
        print('-- Gaussian noise')
        for row in noisy:
            # Zero-mean noise with variance 0.1, added in place.
            row += np.random.normal(0, 0.1**0.5, n_features)
    else:
        print('-- Salt and pepper noise')
        for row in noisy:
            # One draw in [0, 20) per pixel: 0 -> pepper (black, 0),
            # 19 -> salt (white, 255), anything else leaves the pixel as is.
            # Writing 0/255 directly avoids scaling to [0, 1] and back,
            # which perturbed untouched pixels by float rounding.
            mask = np.random.randint(20, size=n_features)
            row[mask == 0] = 0
            row[mask == 19] = 255

    print('- Reducing to 10 dimensions')
    fvectors_train_clean, fvectors_train_noisy = reduce_dimensions(
        np.concatenate((clean, noisy), axis=0), model_data, "Train",
        noisy.shape[0])

    # add training clean and noisy samples together and save in model
    model_data['fvectors_train'] = np.concatenate(
        (fvectors_train_clean, fvectors_train_noisy)).tolist()
    return model_data
def load_test_page(page_name, model):
    """
    Load test data page.

    This function must return each character as a 10-d feature vector with
    the vectors stored as rows of a matrix. A 3x3 median filter is applied
    to every character image first to reduce noise.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage

    Returns:
    The value returned by reduce_dimensions_test for this page.
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)

    # Keep the filtered images in a plain list. Wrapping differently-sized
    # images in np.array() builds a ragged array, which current NumPy
    # rejects with a ValueError.
    # NOTE(review): assumes images_to_feature_vectors accepts any sequence
    # of 2-D arrays - confirm.
    # (An earlier contrast-increase step was dropped by the author because
    # it reduced accuracy.)
    images_test_final = [
        ndimage.median_filter(image, 3) for image in images_test
    ]

    fvectors_test = images_to_feature_vectors(images_test_final, bbox_size)
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions_test(fvectors_test, model)
    return fvectors_test_reduced
def load_test_page(page_name, model):
    """Load a test page as reduced feature vectors.

    Each character image is passed through ``noise_reduction`` before it is
    converted to a feature vector; the vectors are then reduced with
    ``reduce_dimensions`` (intended to give one 10-d row per character).

    The original author notes that the test pages carry salt and pepper
    noise, and that an attempt to estimate the noise level per page (in
    order to tune the k of the nearest-neighbour classifier) was abandoned.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)

    # Denoise every character image of the page.
    denoised = [noise_reduction(image) for image in images_test]

    fvectors_test = images_to_feature_vectors(denoised, bbox_size)
    return reduce_dimensions(fvectors_test, model)
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    The eigenvectors belonging to the 20 largest eigenvalues of the training
    covariance matrix are stored in the model under 'eigenvector' so that
    they can be reused later.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'eigenvector' and
    'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # Eigenvectors for the 20 largest eigenvalues of the covariance matrix.
    # `subset_by_index` replaces the `eigvals` keyword, which is deprecated
    # and no longer accepted by recent SciPy releases. The index range is
    # inclusive, exactly as before.
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, subset_by_index=(N - 20, N - 1))
    # eigh returns ascending eigenvalues; flip so the largest comes first.
    v = np.fliplr(v)
    model_data['eigenvector'] = v.tolist()

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    # The author also experimented with 40 principal components followed by
    # divergence-based feature selection; it was not better, so it was
    # dropped.
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def load_page(page_name, model):
    """Load a test page as raw (unreduced) feature vectors.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage

    Returns:
    The output of images_to_feature_vectors for the page's character images,
    built with the bounding box size stored in the model.
    """
    box = model['bbox_size']
    char_images = utils.load_char_images(page_name)
    return images_to_feature_vectors(char_images, box)
def load_test_page(page_name, model):
    """Load a test page as reduced feature vectors.

    The result is intended to hold one 10-d feature vector per character,
    stored as the rows of a matrix.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    box = model['bbox_size']
    char_images = utils.load_char_images(page_name)
    full_vectors = images_to_feature_vectors(char_images, box)

    # The third argument (1) is forwarded unchanged to reduce_dimensions.
    return reduce_dimensions(full_vectors, model, 1)
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    Besides training, this variant prints diagnostic information about the
    loaded labels, images and extracted feature vectors.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size' and 'fvectors_train' entries
    """
    # Author's note: the loading code can stay as it is while characters are
    # placed into the bounding box from the left; it would need rewriting if
    # characters were stretched to fill the box instead.
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    # Diagnostics for the loading step.
    print("**** print:", char_labels)
    print("shape of labels_train:", char_labels.shape)
    print("length of images_train:", len(char_images))

    # Images -> feature vectors.
    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)

    # Diagnostics for the extraction step (needs at least two samples).
    print("--- fvectors_train_full", full_vectors, "shape:",
          full_vectors.shape)
    for index in (0, 1):
        sample = full_vectors[index]
        print(f"++ fvectors_train_full[{index}]:", sample,
              "image dimensions:", sample.shape)

    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Reducing to 10 dimensions')
    reduced = reduce_dimensions(full_vectors, model_data)
    model_data['fvectors_train'] = reduced.tolist()
    return model_data
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    The training vectors are projected onto the matrix returned by
    ``doPCA(..., 40)``; that projection is fed to ``doLDA`` and both
    matrices are stored in the model together with a word dictionary.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'lda', 'components',
    'noise_levels', 'dict' and 'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # Compute the PCA matrix once and reuse it. It was previously computed
    # twice on identical input: once for the projection, once for storing.
    components = doPCA(fvectors_train_full, 40)

    # Subtract the (scalar, no-axis) mean and project onto the PCA axes.
    centered = fvectors_train_full - np.mean(fvectors_train_full)
    fvectors = np.dot(centered, components)

    # Get dictionary of words from text file
    dictionary = use_dictionary('words.txt')

    # Store W matrix from LDA
    model_data['lda'] = doLDA(fvectors, labels_train, 10).tolist()
    # Store PCA components into the model
    model_data['components'] = components.tolist()
    # Create a new field for noise levels (starts empty)
    model_data['noise_levels'] = []
    # Add dictionary of words to the model
    model_data['dict'] = dictionary

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature vector with
    the vectors stored as rows of a matrix. As a side effect, the largest
    per-character noise estimate of the page is appended to
    model['noise_levels'].

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage; must
        contain 'bbox_size' and a 'noise_levels' list

    Returns:
    The value returned by reduce_dimensions for this page.
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    if bbox_size is None:
        bbox_size = get_bounding_box_size(images_test)
    bbox_height, bbox_width = bbox_size

    # Estimate the noise of every character on a white, bbox-sized canvas.
    char_noise = []
    for image in images_test:
        padded_image = np.ones(bbox_size) * 255
        height, width = image.shape
        width = min(width, bbox_width)
        height = min(height, bbox_height)
        # Copy the (cropped) character INTO the canvas. The previous code
        # only rebound a local name, so the canvas stayed blank and the
        # estimate never depended on the image.
        padded_image[0:height, 0:width] = image[0:height, 0:width]
        char_noise.append(get_estimateNoise(padded_image))

    # A page without characters keeps the previous default of 0.
    maximum_noise = max(char_noise, default=0)

    # Add noise level estimate for this page
    model['noise_levels'].append(maximum_noise)

    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model)
    return fvectors_test_reduced
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    The eigenvectors for the 40 largest eigenvalues of the training
    covariance matrix are stored under 'Principal_Components'; the result
    of ``choose_features`` is stored under 'features'.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'Principal_Components',
    'features' and 'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    # Create a dictionary to store and return results of training stage
    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    # Eigenvectors of the covariance matrix for the 40 largest eigenvalues.
    # `subset_by_index` replaces the `eigvals` keyword, which is deprecated
    # and no longer accepted by recent SciPy releases (same inclusive
    # range).
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, subset_by_index=(N - 40, N - 1))
    # eigh returns ascending eigenvalues; flip so the largest comes first.
    v = np.fliplr(v)
    model_data['Principal_Components'] = v.tolist()

    # Store the features chosen by choose_features and show them.
    model_data['features'] = choose_features(fvectors_train_full, model_data)
    print(model_data['features'])

    # Reduce the training data and store the result.
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    The images are loaded and passed through ``process_noise`` (described by
    the original author as adding noise to part of the data set and then
    reducing it, to simulate noise removal). They are then converted to
    feature vectors and projected onto columns 1..10 of
    ``principal_components(..., 11)``.

    Params:
    train_page_names - list of training page names

    Returns:
    model_data - dictionary with 'labels_train', 'bbox_size', 'v', 'mean'
    and 'fvectors_train' entries for the classification stage
    """
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)

    print("Simulating noise removal")
    full_vectors = images_to_feature_vectors(process_noise(char_images), box)

    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Reducing to 10 dimensions via PCA')
    # Ask for 11 components and drop the first column, keeping ten.
    axes = principal_components(full_vectors, 11)[:, 1:11]
    model_data['v'] = axes.tolist()

    # np.mean with no axis: a single scalar mean over every entry.
    overall_mean = np.mean(full_vectors)
    model_data['mean'] = overall_mean.tolist()
    model_data['fvectors_train'] = np.dot(
        full_vectors - overall_mean, axes).tolist()

    print("Training has finished")
    return model_data
def load_test_page(page_name, model):
    """Load a test page as reduced feature vectors.

    After ``reduce_dimensions`` has been applied, the ten columns with the
    highest summed divergence (computed from the training vectors stored in
    the model) are selected, so ten columns are always returned.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model)

    train_vectors = np.array(model['fvectors_train'])
    train_labels = np.array(model['labels_train'])

    # Lower-case letters without 'j' and 'z', which the author found
    # unhelpful for the divergence. (Adding punctuation did not help.)
    letters = [c for c in string.ascii_lowercase if c not in ('j', 'z')]
    samples = {c: train_vectors[train_labels == c, :] for c in letters}

    # Sum the divergence over every ordered pair of letters, repeating the
    # selection that was used for the training data.
    total_divergence = np.zeros(20)
    for first in letters:
        for second in letters:
            total_divergence += divergence(samples[first], samples[second])

    # Indices of the ten largest accumulated divergences.
    best = np.argsort(-total_divergence)[0:10]
    return fvectors_test_reduced[:, best]
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature vector with
    the vectors stored as rows of a matrix. A page whose mean feature value
    is below 239.0 is treated as noisy and median-filtered. Whether the
    page was noisy is appended to model['test_noisy'] (created on first
    use).

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage

    Returns:
    The value returned by reduce_dimensions(..., "Test") for this page.
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)

    # Mean value over the page; lower means darker, taken as noisier.
    # An empty page is never classed as noisy (this used to divide by zero).
    is_noisy = bool(fvectors_test.size and np.mean(fvectors_test) < 239.0)

    if is_noisy:
        # NOTE(review): the filter runs over each flattened feature vector
        # (window of 3 along the row), not over the 2-D image - kept as is.
        for row in fvectors_test:
            row[:] = ndimage.median_filter(row, 3)

    # Record the decision as a plain bool in a fresh list. The earlier
    # np.append round trip turned the flags into floats whenever the stored
    # list started out empty.
    model['test_noisy'] = list(model.get('test_noisy', [])) + [is_noisy]

    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model, "Test")
    return fvectors_test_reduced
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    :param train_page_names: List of training page names
    :return: Dictionary with 'labels_train', 'bbox_size', 'fvectors_train'
        and 'dict' (lines of data/train/dictionary.txt) entries
    """
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)
    # The author tried adding random noise to the training vectors here and
    # saw little overall improvement, so it is not applied.

    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Reducing to 10 dimensions')
    reduced = reduce_dimensions_train(full_vectors, model_data)
    model_data['fvectors_train'] = reduced.tolist()

    print('Loading the word lists')
    with open('data/train/dictionary.txt', 'r') as word_file:
        # Only newline characters are stripped from each line.
        model_data['dict'] = [line.strip('\n') for line in word_file]

    return model_data
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Word lists are loaded from WORD_FILE_NAMES and stored in the model. The
    training set is then enlarged with ``increase_training_size`` on a
    best-effort basis: if that step fails, training continues with the base
    set.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'bbox_size', 'word_lists', 'fvectors_train' and
    'labels_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['bbox_size'] = bbox_size
    word_lists = [get_word_lists(filename) for filename in WORD_FILE_NAMES]
    model_data['word_lists'] = word_lists

    try:
        fvectors_train_full, labels_train = increase_training_size(
            fvectors_train_full, labels_train, model_data)
    except Exception as err:
        # Still best-effort, but `except Exception` no longer swallows
        # KeyboardInterrupt/SystemExit, and the cause is reported.
        print("Failed to increase training set size. Will proceed with base training set size.")
        print("Reason:", repr(err))

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    model_data['labels_train'] = labels_train.tolist()
    return model_data
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    The eigenvectors for the 10 largest eigenvalues of the training
    covariance matrix are computed here and stored in the model under
    'computed_v', so that they can be reused later instead of being
    recomputed. The words of wordsEn.txt are stored under
    'dictionary_words'.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'computed_v', 'dictionary_words'
    and 'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # `subset_by_index` replaces the `eigvals` keyword, which is deprecated
    # and no longer accepted by recent SciPy releases (same inclusive
    # range).
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, subset_by_index=(N - 10, N - 1))
    # eigh returns ascending eigenvalues; flip so the largest comes first.
    computed_v = np.fliplr(v)
    model_data['computed_v'] = computed_v.tolist()

    # Read the whitespace-separated words of the dictionary file. The
    # context manager closes the file; it was previously left open.
    with open("wordsEn.txt", 'r') as word_file:
        model_data['dictionary_words'] = [
            word for line in word_file for word in line.split()
        ]

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    The eigenvectors for the 40 largest eigenvalues of the training
    covariance matrix are stored in the model under 'eigenvectors'.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'eigenvectors' and
    'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # PCA adapted from the labs: principal axes of the covariance matrix.
    # `subset_by_index` replaces the `eigvals` keyword, which is deprecated
    # and no longer accepted by recent SciPy releases (same inclusive
    # range).
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, subset_by_index=(N - 40, N - 1))
    # eigh returns ascending eigenvalues; flip so the largest comes first.
    v = np.fliplr(v)
    # Storing the eigenvectors in dictionary
    model_data['eigenvectors'] = v.tolist()

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Besides the usual training data, the model receives an empty
    'unique_ratio' list, the object unpickled from
    ../data/Extra/markov_pmatrix.pickle and the stripped lines of
    ../data/Extra/wordlist.txt. Both paths are relative to the working
    directory.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'unique_ratio', 'markov_states',
    'words' and 'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size
    model_data['unique_ratio'] = []

    # Context managers close both files even if loading raises.
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only safe while the pickle is a trusted project artefact.
    with open('../data/Extra/markov_pmatrix.pickle', 'rb') as infile:
        model_data['markov_states'] = pickle.load(infile)

    with open('../data/Extra/wordlist.txt', 'r') as word_file:
        model_data['words'] = [line.strip() for line in word_file]

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
def load_test_page(page_name, model):
    """Load a test page as reduced feature vectors.

    The character images are passed through ``remove_noise``, converted to
    feature vectors, centred with the stored mean and projected onto the
    stored matrix ``model['v']``.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage

    Returns:
    fvectors_test_reduced - matrix with one projected feature vector per
    row (intended to be 10-d)
    """
    box = model['bbox_size']
    char_images = utils.load_char_images(page_name)
    full_vectors = images_to_feature_vectors(remove_noise(char_images), box)

    # Same centring and projection that the training stage stored.
    centre = np.array(model['mean'])
    axes = np.array(model['v'])
    return np.dot(full_vectors - centre, axes)
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'eigenvector', 'noise_dim',
    'dim' and 'fvectors_train' entries
    """
    print('Reading data')
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)

    model_data = {
        'labels_train': char_labels.tolist(),
        'bbox_size': box,
        # Empty placeholder, to be updated with eigenvectors later.
        'eigenvector': [],
        # PCA settings: 50 dimensions for noise reduction, 10 for the final
        # dimension reduction (presumably read by reduce_dimensions).
        'noise_dim': 50,
        'dim': 10,
    }

    print('Reducing to 10 dimensions')
    reduced = reduce_dimensions(full_vectors, model_data)
    model_data['fvectors_train'] = reduced.tolist()
    return model_data
def process_training_data(train_page_names):
    """Run the training stage and collect its results in a dictionary.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'labels_train', 'bbox_size', 'principal_comp', 'textFile',
    'selected_features' and 'fvectors_train' entries
    """
    # Accumulate images and labels over all training pages.
    char_images, char_labels = [], []
    for page in train_page_names:
        char_images = utils.load_char_images(page, char_images)
        char_labels = utils.load_labels(page, char_labels)
    char_labels = np.array(char_labels)

    print('Extracting features from training data')
    box = get_bounding_box_size(char_images)
    full_vectors = images_to_feature_vectors(char_images, box)

    # The model is a plain dictionary; labels and bbox size go in first.
    model_data = {'labels_train': char_labels.tolist(), 'bbox_size': box}

    print('Extracting principal components')
    model_data['principal_comp'] = principal_components(full_vectors).tolist()

    # External word list, stored for the error-correction step.
    print('Getting the dictionary')
    with open('wiki-100k.txt', 'r') as wiki:
        model_data['textFile'] = [entry.strip() for entry in wiki]

    print("Select Best Features")
    model_data['selected_features'] = select_features(full_vectors, model_data)

    # Author observed (14395, 2340) before and (14395, 10) after reduction.
    print('Initial dimensions: ' + str(full_vectors.shape))
    print('Reducing to 10 dimensions')
    reduced = reduce_dimensions(full_vectors, model_data)
    print('Reduced dimensions: ' + str(reduced.shape))

    # Store the reduced vectors as a list and return the model.
    model_data['fvectors_train'] = reduced.tolist()
    return model_data
def process_training_data(train_page_names):
    """
    Perform the training stage and return results in a dictionary.

    The first half of the training vectors is darkened with a fixed
    pseudo-random pattern, clamped at 0 and median-filtered, then stacked
    back on top of the untouched second half. PCA (40 eigenvectors) is
    learnt from the combined data.

    Note: this reseeds NumPy's global random generator with seed 2.

    Params:
    train_page_names - list of training page names

    Returns:
    dict with 'train_mean', 'labels_train', 'bbox_size', 'v' and
    'fvectors_train' entries
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    half = fvectors_train_full.shape[0] // 2
    n_features = fvectors_train_full.shape[1]

    # One noise value in [0, 100) per feature, identical for every image.
    # The length follows the data instead of the hard-coded 2340.
    np.random.seed(2)
    noise = (np.random.rand(n_features) * 100).astype(int)

    # Darken the first half and clamp at 0 (black). np.maximum covers every
    # entry; the earlier index loops skipped the last row and last column.
    fvectors_train_fhalf = np.maximum(
        np.subtract(fvectors_train_full[:half, :], noise), 0)

    # Median-filter each noisy vector (the author states the same filter is
    # applied to the test images). reshape keeps the result 2-D even when
    # the first half is empty. A contrast increase was tried and dropped
    # because it reduced accuracy.
    fvectors_train_fhalf_final = np.array([
        ndimage.median_filter(vector, 3) for vector in fvectors_train_fhalf
    ]).reshape(-1, n_features)

    # Rebuild the full training set: filtered noisy half + original rest.
    fvectors_train_shalf = fvectors_train_full[half:, :]
    fvectors_train_full = np.vstack(
        (fvectors_train_fhalf_final, fvectors_train_shalf))

    model_data = dict()
    # np.mean with no axis: a single scalar mean over every entry.
    model_data['train_mean'] = np.mean(fvectors_train_full).tolist()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    # 40 eigenvectors of the covariance matrix of all training vectors.
    # `subset_by_index` replaces the `eigvals` keyword, which is deprecated
    # and no longer accepted by recent SciPy releases (same inclusive
    # range).
    covx = np.cov(fvectors_train_full, rowvar=False)
    N = covx.shape[0]
    _, v = linalg.eigh(covx, subset_by_index=(N - 40, N - 1))
    # eigh returns ascending eigenvalues; flip so the largest comes first.
    v = np.fliplr(v)
    model_data['v'] = v.tolist()

    fvectors_train = reduce_dimensions_train(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data