def internal_comparisons_test(self, n_stolen, n_sighted, n_matches,
                              quiet=False, seed=DEF_SEED):
    """
    Test that the student's comparison count matches the count we made
    internally. This does not mean the count is correct, just that the
    comparisons were counted consistently.
    Setting quiet=True means the feedback summary won't be printed,
    which is useful when combining this with standard comparison tests
    in a single test case.
    """
    base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
    stolen, sighted, _ = utilities.read_dataset(TEST_FOLDER + base_file)
    start = time.perf_counter()
    _, student_count = self.matching_function(stolen, sighted)
    end = time.perf_counter()
    delta = end - start
    # prints student comparisons and time taken
    template = '{}, c={}, {:.4f}s'
    feedback = template.format(base_file, student_count, delta)
    if not quiet:
        print(feedback, end=' ... ')
    self.assertEqual(student_count, real_comparisons())
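# A minimal sketch (hypothetical test case; the method name and data
# sizes are assumptions, not taken from the test suite) of what the
# docstring means by combining this check with a standard comparison
# test: quiet=True stops the internal check printing a second feedback
# line for the same file.
def test_comparisons_100_100_10(self):
    self.internal_comparisons_test(100, 100, 10, quiet=True)
    self.comparisons_test(100, 100, 10)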
def run_simple_tests():
    """ A nice place for your testing code """
    filename = './test_data/10s-10000-10-a.txt'
    db_list, sighted_list, matches_list = utilities.read_dataset(filename)
    # build a linear-probing hash table from the stolen-plate database
    table_db_make = make_db_hash_table(db_list, 10, LinearHashTable)
    # process the camera stream against the database
    table_process = process_camera_stream(db_list, sighted_list, 20, 5)
def run_tests():
    """ Use this function to run some simple tests to help with
    developing your awesome answer code.
    You should leave this out of your submission.
    """
    from utilities import read_dataset
    file_name = './test_data/0s-5-0-a.txt'
    stolen_list, sighted_list, matches = read_dataset(file_name)
    result_list = []
    total_comparisons = 0
    for plate in sighted_list:
        comparisons = search(stolen_list, plate, result_list)
        total_comparisons += comparisons
    return result_list, total_comparisons
def plates_test(self, n_stolen, n_sighted, n_matches, seed=DEF_SEED):
    """
    Test that the given matching_function returns the correct result
    for the file specified by test_file_name.
    """
    base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
    stolen, sightings, expected_list = utilities.read_dataset(
        TEST_FOLDER + base_file)
    start = time.perf_counter()
    student_answer, comps = self.matching_function(stolen, sightings)
    end = time.perf_counter()
    delta = end - start
    print('{}, c={}, {:.4f}s'.format(base_file, comps, delta),
          end=' ... ')
    self.assertEqual(student_answer, expected_list)
    if len(student_answer) > 0:
        self.assertTypesEqual(student_answer[0], expected_list[0])
def process_stream_test(self, n_items, n_sighted, n_matches,
                        db_table_size, results_table_size):
    """
    Note: the condensed_str of results is used when
    results_table_size > 100.
    """
    self.setUp()
    base_file = self.base_filename(n_items, n_sighted, n_matches)
    test_file_name = TEST_FOLDER + base_file
    database_list, sightings, matches = utilities.read_dataset(
        test_file_name)
    txtless_basefile = base_file[:-4]  # strips the .txt

    # get expected db table
    template = '{}expected_db_linear_{}-{}.txt'
    expected_db_file_name = template.format(TEST_FOLDER,
                                            txtless_basefile,
                                            db_table_size)
    expected_db_table_str = utilities.read_expected(expected_db_file_name)

    # get expected results table
    template = '{}expected_results_table_{}-{}-{}.txt'
    expected_results_file_name = template.format(TEST_FOLDER,
                                                 txtless_basefile,
                                                 db_table_size,
                                                 results_table_size)
    expected_results_table_str = utilities.read_expected(
        expected_results_file_name)

    # get db table and results table
    database, results = process_camera_stream(database_list, sightings,
                                              db_table_size,
                                              results_table_size)
    self.assertEqual(str(database), expected_db_table_str)
    # expected results for tables larger than 100 are in condensed form
    if results_table_size <= 100:
        self.assertEqual(str(results), expected_results_table_str)
    else:
        self.assertEqual(results.condensed_str(),
                         expected_results_table_str)
    return True
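# Hypothetical invocations (test names and sizes are assumptions)
# sketching both sides of the 100-slot threshold: a small results table
# is compared via str(), a larger one via condensed_str().
def test_small_results_table(self):
    self.process_stream_test(10, 10000, 10, db_table_size=20,
                             results_table_size=5)

def test_condensed_results_table(self):
    self.process_stream_test(10, 10000, 10, db_table_size=20,
                             results_table_size=128)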
def run_main(method, fs_functions, score_name, n_clfs=5,
             dataset_name="PC4"):
    print("\nDATASET: %s\nMETHOD: %s\n" % (dataset_name, method))
    np.random.seed(1)

    ##### 1. ------ GET DATASET
    X, y, ft_names = ut.read_dataset("datasets/",
                                     dataset_name=dataset_name)

    ##### 2. ------ RUN TRAINING METHOD
    methods.run_method(method, X, y, n_clfs=n_clfs,
                       fs_functions=fs_functions,
                       score_name=score_name)

    pl.title(dataset_name)
    pl.ylabel(score_name)
    pl.legend(loc="best")

    # return the plot as an in-memory PNG
    img = BytesIO()
    pl.savefig(img)
    img.seek(0)
    return img
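# A minimal usage sketch: the method and fs_functions values below are
# hypothetical placeholders, not names taken from this repo; "auc" is
# one of the score_name choices seen in the CLI script. It saves the
# in-memory PNG that run_main returns to disk.
if __name__ == '__main__':
    img = run_main("some_method", fs_functions=["some_fs_function"],
                   score_name="auc", n_clfs=5, dataset_name="PC4")
    with open("PC4_auc.png", "wb") as out_file:
        out_file.write(img.getbuffer())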
def comparisons_test(self, n_stolen, n_sighted, n_matches,
                     expected=None, seed=DEF_SEED):
    """
    Test that the number of comparisons the student made is within the
    expected bounds (provided by self.get_bounds, or by expected).
    """
    base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
    stolen, sighted, _ = utilities.read_dataset(TEST_FOLDER + base_file)
    start = time.perf_counter()
    _, student_count = self.matching_function(stolen, sighted)
    end = time.perf_counter()
    delta = end - start
    print('{}, c={}, {:.4f}s'.format(base_file, student_count, delta),
          end=' ... ')
    if expected is not None:
        self.assertEqual(student_count, expected)
    else:
        self.check_comparisons_within_bounds(student_count, len(stolen),
                                             len(sighted), n_matches)
def make_db_hash_table_test(self, n_items, n_sighted, n_matches,
                            n_slots):
    base_file = self.base_filename(n_items, n_sighted, n_matches)
    test_file_name = TEST_FOLDER + base_file
    database_list, sightings, matches = utilities.read_dataset(
        test_file_name)
    # pick the expected-output file that matches the table class
    if self.table_class == LinearHashTable:
        expected_part = 'expected_db_linear_'
    elif self.table_class == ChainingHashTable:
        expected_part = 'expected_db_chaining_'
    else:
        expected_part = 'expected_db_table_list_'
    template = '{}{}{}-{}.txt'
    txtless_basefile = base_file[:-4]  # strips the .txt
    expected_file_name = template.format(TEST_FOLDER, expected_part,
                                         txtless_basefile, n_slots)
    with open(expected_file_name) as expected_file:
        expected_table_str = expected_file.read()
    database = make_db_hash_table(database_list, n_slots,
                                  table_class=self.table_class)
    self.assertEqual(str(database), expected_table_str)
    return True
parser.add_argument('-d', '--dataset_name', default="ant")
parser.add_argument('-n', '--n_clfs', default=5, type=int)
parser.add_argument('-s', '--score_name', required=True,
                    choices=["auc", "gmeans"])
args = parser.parse_args()

method = args.method
dataset_name = args.dataset_name
fs_functions = args.fs_functions
n_clfs = args.n_clfs
score_name = args.score_name

print("\nDATASET: %s\nMETHOD: %s\n" % (dataset_name, method))
np.random.seed(1)

##### 1. ------ GET DATASET
X, y, ft_names = ut.read_dataset("datasets/", dataset_name=dataset_name)
pl.title(dataset_name)
pl.ylabel("AUC")

##### 2. ------ RUN TRAINING METHOD
methods.run_method(method, X, y, n_clfs=n_clfs,
                   fs_functions=fs_functions, score_name=score_name)

pl.legend(loc="best")
pl.show()
import utilities
import os
import operator

inverted_index_file = "inverted_index_5000"
frequency_file = "frequency_5000.csv"

# read the dataset into a dictionary of keyword -> result set of ids
inverted_dict = utilities.read_dataset(inverted_index_file)

# Approach 1: pad every keyword's result set to the same length as the
# most frequent keyword's, then measure the overhead this introduces.

# find the length of the longest result set and count the real ids
longest = 0
for id_list in inverted_dict.values():
    if len(id_list) > longest:
        longest = len(id_list)

# pad each result set, counting real and padding ids as we go
count_real = 0
count_padding = 0
for id_list in inverted_dict.values():
    count_real += len(id_list)
    if len(id_list) < longest:
        diff = longest - len(id_list)
        for i in range(diff):
            # id_list.add(utilities.generate_random_id())
            # The generator makes the program run very slowly, so we pad
            # with i here; in a real deployment we would use
            # generate_random_id().
            id_list.add(i)
            count_padding += 1
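# A short sketch of summarising the overhead that Approach 1 introduces,
# using the counters computed above (the report format is an assumption).
total_ids = count_real + count_padding
print("real ids:", count_real)
print("padding ids:", count_padding)
print("padding overhead: {:.2%} of the padded index".format(
    count_padding / total_ids))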
                       transform_sqrt=True, visualize=True,
                       block_norm="L1")
    return hist


if __name__ == '__main__':
    # to hold feature vectors for all images
    features = []
    total_images = 0
    # save all feature vectors + label in a csv file
    with open("..\\features.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        # loop over the training images
        images_paths, labels = utils.read_dataset(args["training"])
        for image_path in images_paths:
            image = cv2.imread(image_path)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            lbp_hist = get_lbp(gray)
            rot_lbp_hist = get_rot_lbp(gray)
            hog_hist = get_hog(gray)
            # the class label is the parent directory name
            label = image_path.split("\\")[-2]
            features.append(np.concatenate(
                (lbp_hist, rot_lbp_hist, hog_hist, label), axis=None))
            # write this image's row, then reset the buffer
            writer.writerows(features)
            features = []
            total_images = total_images + 1
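# A hedged sketch of reading the CSV back for training, assuming the row
# layout written above: feature values followed by a single label column.
import csv
import numpy as np

with open("..\\features.csv", newline="") as f:
    rows = [row for row in csv.reader(f) if row]
X = np.array([row[:-1] for row in rows], dtype=float)
y = np.array([row[-1] for row in rows])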
parser.add_argument('-n', '--n_clfs', default=5, type=int)
parser.add_argument('-p', '--problem', default="multiclassification",
                    choices=["multiclassification", "ovr"])
# store_true makes -u a proper boolean flag (a plain default=False would
# treat any supplied value, even "False", as truthy)
parser.add_argument('-u', '--unbalanced', action='store_true')
args = parser.parse_args()

method = args.method
fs_functions = args.fs_functions
n_clfs = args.n_clfs
problem = args.problem
use_unbalanced_data = args.unbalanced

np.random.seed(1)

# GET DATASET
feature_matrix, facies_vector = ut.read_dataset()
# draw_data_histogram(facies_vector)
ut.get_feature_statistics(feature_matrix)

# Preprocessing
feature_matrix = preprocessing.normalize(feature_matrix, facies_vector)

# convert the dataset to a binary classification problem: class x vs rest
if problem == "ovr":
    facies_vector = ut.convert_to_binary_classification(2, facies_vector)

if not use_unbalanced_data:
    feature_matrix, facies_vector = preprocessing.balance(feature_matrix,
                                                          facies_vector)

# RUN TRAINING METHOD
print("Method: ", method)