def internal_comparisons_test(self,
                                  n_stolen,
                                  n_sighted,
                                  n_matches,
                                  quiet=False,
                                  seed=DEF_SEED):
        """ Test that the student has correctly counted the code against what
            we have counted. This does not mean that the count is correct, just
            that it was correctly counted.
            setting quiet = True means the feedback summary won't be printed,
            which is useful if using along with standard comparisons in
            a single test case.
        """
        base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
        (stolen, sighted, _) = utilities.read_dataset(TEST_FOLDER + base_file)

        start = time.perf_counter()
        _, student_count = self.matching_function(stolen, sighted)
        end = time.perf_counter()
        delta = end - start

        # prints student comparisons and time taken
        template = '{}, c={}, {:.4f}s'
        feedback = template.format(base_file, student_count, delta)
        if not quiet:
            print(feedback, end=' ... ')

        self.assertEqual(student_count, real_comparisons())
def run_simple_tests():
    """ A nice place for your testing code """
    filename = './test_data/10s-10000-10-a.txt'
    db_list, sighted_list, matches_list = utilities.read_dataset(filename)

    db_table = make_db_hash_table(db_list, 10, LinearHashTable)
    database, results = process_camera_stream(db_list, sighted_list, 20, 5)
    print(db_table)
    print(results)
def run_tests():
    """ Use this function to run some simple tests 
    to help with developing your awesome answer code.
    You should leave this out of your submission """
    from utilities import read_dataset
    file_name = './test_data/0s-5-0-a.txt'
    stolen_list, sighted_list, matches = read_dataset(file_name)

    result_list = []
    total_comparisons = 0

    for plate in sighted_list:
        comparisons = search(stolen_list, plate, result_list)
        total_comparisons += comparisons

    return result_list, total_comparisons
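
search is used above but not defined in this snippet; a minimal sketch, assuming a plain linear scan that appends any match to result_list and returns the number of comparisons it made (the exact behaviour is an assumption):

def search(stolen_list, plate, result_list):
    # Hypothetical linear search: one comparison per stolen plate examined.
    comparisons = 0
    for stolen_plate in stolen_list:
        comparisons += 1
        if stolen_plate == plate:
            result_list.append(plate)
            break
    return comparisons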
    def plates_test(self, n_stolen, n_sighted, n_matches, seed=DEF_SEED):
        """ Test that the given matching_function returns the correct
            result for the file specified by test_file_name.
        """
        base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
        stolen, sightings, expected_list = utilities.read_dataset(TEST_FOLDER +
                                                                  base_file)

        start = time.perf_counter()
        student_answer, comps = self.matching_function(stolen, sightings)
        end = time.perf_counter()
        delta = end - start
        print('{}, c={}, {:.4f}s'.format(base_file, comps, delta), end=' ... ')

        self.assertEqual(student_answer, expected_list)
        if len(student_answer) > 0:
            self.assertTypesEqual(student_answer[0], expected_list[0])
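
assertTypesEqual is a custom assertion defined elsewhere in the test class; a minimal sketch of what it plausibly checks, assuming each row is a tuple of fields whose types must match (an assumption, not the actual helper):

    def assertTypesEqual(self, student_row, expected_row):
        # Hypothetical sketch: the real helper lives elsewhere in the class.
        for student_item, expected_item in zip(student_row, expected_row):
            self.assertEqual(type(student_item), type(expected_item))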
Example #5
    def process_stream_test(self, n_items, n_sighted, n_matches,
                            db_table_size, results_table_size):
        """ Note: The condensed_str of results is used when results_table_size > 100 """
        self.setUp()
        # get expected db table
        base_file = self.base_filename(n_items, n_sighted, n_matches)
        test_file_name = TEST_FOLDER + base_file
        database_list, sightings, matches = utilities.read_dataset(
            test_file_name)
        template = '{}expected_db_linear_{}-{}.txt'
        txtless_basefile = base_file[:-4]  # strips the .txt
        expected_db_file_name = template.format(TEST_FOLDER,
                                                txtless_basefile,
                                                db_table_size)

        expected_db_table_str = utilities.read_expected(expected_db_file_name)

        # get expected results table
        template = '{}expected_results_table_{}-{}-{}.txt'
        expected_results_file_name = template.format(TEST_FOLDER,
                                                     txtless_basefile,
                                                     db_table_size,
                                                     results_table_size)
        expected_results_table_str = utilities.read_expected(
            expected_results_file_name)

        # get db table and results table
        database, results = process_camera_stream(database_list,
                                                  sightings,
                                                  db_table_size,
                                                  results_table_size)

        self.assertEqual(str(database), expected_db_table_str)

        # expected results for tables larger than 100 will be in condensed form.
        if results_table_size <= 100:
            self.assertEqual(str(results), expected_results_table_str)
        else:
            self.assertEqual(results.condensed_str(),
                             expected_results_table_str)
        return True
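
condensed_str is a method of the results hash table, not shown in these snippets; a plausible sketch, assuming a slots list attribute (both the attribute name and the output format are assumptions):

    def condensed_str(self):
        # Hypothetical: render only occupied slots so large tables stay short.
        lines = []
        for index, slot in enumerate(self.slots):
            if slot is not None:
                lines.append('{}: {}'.format(index, slot))
        return '\n'.join(lines)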
Example #6
File: main.py  Project: Charmnut/ML_in_SE
def run_main(method, fs_functions, score_name, n_clfs=5, dataset_name="PC4"):

    print("\nDATASET: %s\nMETHOD: %s\n" % (dataset_name, method))
    np.random.seed(1)

    ##### 1. ------ GET DATASET
    X, y, ft_names = ut.read_dataset("datasets/", dataset_name=dataset_name)
    ##### 2. ------- RUN TRAINING METHOD
    methods.run_method(method,
                       X,
                       y,
                       n_clfs=n_clfs,
                       fs_functions=fs_functions,
                       score_name=score_name)
    pl.title(dataset_name)
    pl.ylabel(score_name)

    pl.legend(loc="best")
    img = BytesIO()
    pl.savefig(img)
    img.seek(0)
    return img
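
run_main returns the figure as an in-memory PNG buffer, so a hypothetical caller could serve it over HTTP; the Flask route below is an illustration only, and the argument values are placeholders, not names from this project:

from flask import Flask, send_file

app = Flask(__name__)


@app.route('/plot')
def plot():
    # Placeholder arguments; a real caller would pass its own method,
    # feature-selection functions and score name.
    img = run_main("rfc", ["fisher"], "auc")
    return send_file(img, mimetype='image/png')

Example #7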
    def comparisons_test(self,
                         n_stolen,
                         n_sighted,
                         n_matches,
                         expected=None,
                         seed=DEF_SEED):
        """ Test that the number of comparisons that the student made is
            within the expected bounds (provided by self.get_bounds, or expected)
        """
        base_file = self.base_filename(n_stolen, n_sighted, n_matches, seed)
        stolen, sighted, _ = utilities.read_dataset(TEST_FOLDER + base_file)

        start = time.perf_counter()
        _, student_count = self.matching_function(stolen, sighted)
        end = time.perf_counter()
        delta = end - start
        print('{}, c={}, {:.4f}s'.format(base_file, student_count, delta),
              end=' ... ')

        if expected is not None:
            self.assertEqual(student_count, expected)
        else:
            self.check_comparisons_within_bounds(student_count, len(stolen),
                                                 len(sighted), n_matches)
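
check_comparisons_within_bounds is defined elsewhere in the test class; a minimal sketch of the pattern, assuming self.get_bounds (mentioned in the docstring above) returns an inclusive (low, high) pair:

    def check_comparisons_within_bounds(self, student_count, n_stolen,
                                        n_sighted, n_matches):
        # Sketch only: the real bounds logic lives in self.get_bounds.
        low, high = self.get_bounds(n_stolen, n_sighted, n_matches)
        self.assertGreaterEqual(student_count, low)
        self.assertLessEqual(student_count, high)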
Example #8
    def make_db_hash_table_test(self, n_items, n_sighted, n_matches, n_slots):
        """ Test that make_db_hash_table builds the expected table for the
            given dataset and table class.
        """
        base_file = self.base_filename(n_items, n_sighted, n_matches)
        test_file_name = TEST_FOLDER + base_file
        database_list, sightings, matches = utilities.read_dataset(
            test_file_name)
        if self.table_class == LinearHashTable:
            expected_part = 'expected_db_linear_'
        elif self.table_class == ChainingHashTable:
            expected_part = 'expected_db_chaining_'
        else:
            expected_part = 'expected_db_table_list_'
        template = '{}{}{}-{}.txt'
        txtless_basefile = base_file[:-4]  # strips the .txt
        expected_file_name = template.format(TEST_FOLDER,
                                             expected_part,
                                             txtless_basefile,
                                             n_slots)
        with open(expected_file_name) as expected_file:
            expected_table_str = expected_file.read()
        database = make_db_hash_table(database_list,
                                      n_slots,
                                      table_class=self.table_class)
        self.assertEqual(str(database), expected_table_str)
        return True
Example #9
    parser.add_argument('-d', '--dataset_name', default="ant")

    parser.add_argument('-n', '--n_clfs', default=5, type=int)

    parser.add_argument('-s', '--score_name', required=True,
                        choices=["auc", "gmeans"])

    args = parser.parse_args()      
    method = args.method
    dataset_name = args.dataset_name
    fs_functions = args.fs_functions
    n_clfs = args.n_clfs
    score_name = args.score_name

    print("\nDATASET: %s\nMETHOD: %s\n" % (dataset_name, method))
    np.random.seed(1)


    ##### 1. ------ GET DATASET
    X, y, ft_names = ut.read_dataset("datasets/", dataset_name=dataset_name)
    pl.title(dataset_name)
    pl.ylabel("AUC")

    ##### 2. ------- RUN TRAINING METHOD
    methods.run_method(method, X, y, n_clfs=n_clfs, 
                       fs_functions=fs_functions, 
                       score_name=score_name)

    pl.legend(loc="best")
    pl.show()
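
A hypothetical invocation, assuming the script is saved as main.py (the method and fs_functions flags are parsed by add_argument calls not shown in this excerpt):

python main.py -s auc -d ant -n 5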
import utilities
import os
import operator

inverted_index_file = "inverted_index_5000"
frequency_file = "frequency_5000.csv"    

# read the dataset into a dictionary
inverted_dict = utilities.read_dataset(inverted_index_file)

# Approach 1: pad every keyword's posting list to the same length as the
# most frequent keyword's list, then measure the overhead this introduces.
# First find the longest list and the real number of ids.
longest = 0
for id_list in inverted_dict.values():
    if len(id_list) > longest:
        longest = len(id_list)
    
# pad each list, counting real and padding entries as we go
count_real = 0
count_padding = 0
for id_list in inverted_dict.values():
    count_real += len(id_list)
    if len(id_list) < longest:
        diff = longest - len(id_list)
        for i in range(diff):
            # A real deployment would call
            # id_list.add(utilities.generate_random_id()); we add i instead
            # because generating random ids makes this demo run very slowly.
            id_list.add(i)
            count_padding += 1
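
The loop above tallies count_real and count_padding but never reports the overhead that Approach 1 sets out to identify; a minimal summary, using only names defined above:

# report how much extra storage the padding scheme costs
overhead = count_padding / count_real
print("real ids: {}, padding ids: {}, overhead: {:.2%}".format(
    count_real, count_padding, overhead))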
def get_hog(image):
    # Reconstructed header: the original was cut off. Assumes
    # skimage.feature.hog; with visualize=True it returns (features, image).
    hist, _ = hog(image,
                  transform_sqrt=True,
                  visualize=True,
                  block_norm="L1")
    return hist
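
get_lbp and get_rot_lbp are called in the main block below, but their definitions were lost along with the start of this snippet; a minimal sketch of get_lbp, assuming skimage's local_binary_pattern (the points/radius defaults are placeholder choices):

import numpy as np
from skimage.feature import local_binary_pattern


def get_lbp(image, points=8, radius=1):
    # Uniform LBP produces codes 0..points+1, hence points + 2 histogram bins.
    lbp = local_binary_pattern(image, points, radius, method="uniform")
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, points + 3))
    return hist / hist.sum()  # normalise so image size does not matter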


if __name__ == '__main__':
    total_images = 0
    # save all feature vectors + label in a csv file
    with open("..\\features.csv", 'w', newline="") as f:
        writer = csv.writer(f)

        # loop over the training images
        images_paths, labels = utils.read_dataset(args["training"])
        for image_path in images_paths:
            image = cv2.imread(image_path)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            lbp_hist = get_lbp(gray)
            rot_lbp_hist = get_rot_lbp(gray)
            hog_hist = get_hog(gray)
            label = image_path.split("\\")[-2]
            # write one row per image: histograms followed by the label
            writer.writerow(
                np.concatenate((lbp_hist, rot_lbp_hist, hog_hist, label),
                               axis=None))
            total_images = total_images + 1
    parser.add_argument('-n', '--n_clfs', default=5, type=int)

    parser.add_argument('-p', '--problem', default="multiclassification",
                        choices=["multiclassification", "ovr"])
    # A plain default=False would make any passed value truthy (even "False"),
    # so parse this as a proper boolean flag.
    parser.add_argument('-u', '--unbalanced', action='store_true')

    args = parser.parse_args()
    method = args.method
    fs_functions = args.fs_functions
    n_clfs = args.n_clfs
    problem = args.problem
    use_unbalanced_data = args.unbalanced
    np.random.seed(1)

    # GET DATASET
    feature_matrix, facies_vector = ut.read_dataset()
    # draw_data_histogram(facies_vector)
    ut.get_feature_statistics(feature_matrix)

    # Preprocessing
    feature_matrix = preprocessing.normalize(feature_matrix, facies_vector)

    # convert to a binary (one-vs-rest) problem: the chosen class vs the rest
    if problem == "ovr":
        facies_vector = ut.convert_to_binary_classification(2, facies_vector)

    if not use_unbalanced_data:
        feature_matrix, facies_vector = preprocessing.balance(feature_matrix, facies_vector)

    # RUN TRAINING METHOD
    print("Method: ", method)