Example #1
0
    def test_hardy_main(self):
        """Run the full hardy pipeline end-to-end in both classifier modes.

        Exercises ``run.hardy_main`` twice -- once with the Keras tuner on a
        single train/validation split, once with a plain CNN under k-fold
        validation -- and checks that each run writes yaml report files that
        load back as dictionaries. The result folders are removed afterwards.
        """

        def check_yaml_reports(report_dir):
            # Every .yaml file in the report folder must load back as a dict
            # (the report format used throughout the project).
            for item in os.listdir(report_dir):
                if item.endswith('.yaml'):
                    with open(report_dir + item, 'r') as file:
                        report = yaml.load(file, Loader=yaml.FullLoader)
                        assert isinstance(report, dict), \
                            'The filetype returned is not a dictionary'

        # single train/validation split using the keras tuner
        run.hardy_main(data_path,
                       tform_config_path,
                       config_path,
                       iterator_mode='arrays',
                       classifier='tuner',
                       num_test_files_class=1,
                       classes=['noise', 'one'],
                       split=0.5,
                       batch_size=1,
                       project_name='test_wrapper')
        output_path = preprocessing.save_to_folder(data_path, 'test_wrapper',
                                                   'test_1')
        check_yaml_reports(output_path + 'report/')
        shutil.rmtree('./hardy/test/test_data/test_wrapper')

        # use k-fold validation with the plain CNN classifier
        run.hardy_main(data_path,
                       tform_config_path,
                       config_path,
                       k_fold=True,
                       k=2,
                       iterator_mode='arrays',
                       classifier='cnn',
                       num_test_files_class=1,
                       classes=['noise', 'one'],
                       batch_size=1,
                       project_name='test_wrapper')
        output_path = preprocessing.save_to_folder(data_path, 'test_wrapper',
                                                   'test_1')
        check_yaml_reports(output_path + 'report/')
        shutil.rmtree('./hardy/test/test_data/test_wrapper')
Example #2
0
 def test_classifier_wrapper(self):
     """Train a CNN through ``run.classifier_wrapper`` in folder-iterator
     mode and check that the generated yaml reports load as dictionaries.

     The result folder is removed once the reports have been verified.
     """
     num_files = 3
     run_name = 'test_1'
     config_path = './hardy/test/'
     output_path = preprocessing.save_to_folder(path, 'test_classifier',
                                                run_name)
     test_set_filenames = preprocessing.hold_out_test_set(
         data_path, number_of_files_per_class=num_files)
     run.classifier_wrapper(path,
                            test_set_filenames,
                            run_name,
                            config_path,
                            classifier='cnn',
                            iterator_mode='folder',
                            split=split,
                            classes=classes,
                            image_path=path,
                            project_name='test_classifier')
     report_dir = output_path + 'report/'
     # every .yaml report written by the run must load back as a dict
     for item in os.listdir(report_dir):
         if item.endswith('.yaml'):
             with open(report_dir + item, 'r') as file:
                 report = yaml.load(file, Loader=yaml.FullLoader)
                 assert isinstance(report, dict), \
                     'The filetype returned is not a dictionary'
     # remove the result folder after checking it was correctly created
     shutil.rmtree(path + 'test_classifier/')
     print('the result folder was correctly deleted after testing')
Example #3
0
def classifier_wrapper(input_path,
                       test_set_filenames,
                       run_name,
                       config_path,
                       classifier='tuner',
                       iterator_mode='arrays',
                       split=0.1,
                       target_size=(80, 80),
                       color_mode='rgb',
                       batch_size=32,
                       image_path=None,
                       classes=None,
                       project_name='tuner_run',
                       k_fold=False,
                       k=None,
                       **kwarg):
    '''
    Single "universal" wrapping function to set up and run the CNN and tuner
    on any properly labeled image set.

    Operates in either of two formats:
        "arrays"  : loads pickled image tuples produced by the data wrapper
        "else"    : takes data as "image_path" of sorted image folders

    Parameters:
    -----------
    input_path : str
                 path to the raw .csv files containing the data to classify
    test_set_filenames : list
                         the list containing the strings of filenames
                         randomly selected to be part of the test set
    run_name : str
               name used to create a folder for storing the results of this
               run
    config_path : str
                  string containing the path to the yaml file
                  representing the classifier hyperparameters
    classifier : str
                 option cnn or tuner
    iterator_mode : str
                    option to use images from arrays directly or save the
                    .png and use a directory iterator mode
    split : float
            the percentage of the learning set to use for the validation step
    target_size : tuple
                  image target size. Presented as a tuple indicating number
                  of pixels composing the two dimensions of the image (w x h)
    color_mode : str
                 color mode passed through to the image set generators
    batch_size : int
                 the number of files to group up into a batch
    image_path : str
                 path to the sorted image folders; required whenever
                 iterator_mode is not 'arrays'
    classes : list
              a list containing strings of the classes the data is divided
              in. The class name represents the folder name the files are
              contained in. Defaults to ['class_1', 'class_2'].
    project_name : str
                   name of the folder to be created for storing the results
                   of the tuning
    k_fold : bool
             option to use k-fold cross validation (cnn classifier only)
    k : int
        number of folds; required when k_fold is True
    '''
    # avoid a shared mutable default argument for the class list
    if classes is None:
        classes = ['class_1', 'class_2']

    if iterator_mode == 'arrays':
        # loading pickled data
        image_data = handling.pickled_data_loader(input_path, run_name)

        assert image_data, 'No image_data list provided'

        test_set_list, learning_set_list = to_catalogue.data_set_split(
            image_data, test_set_filenames)

        # training/validation sets are only needed when not k-folding;
        # the test set is built identically in both cases
        if not k_fold:
            training_set, validation_set = to_catalogue.learning_set(
                image_list=learning_set_list,
                split=split,
                classes=classes,
                target_size=target_size,
                iterator_mode='arrays',
                batch_size=batch_size,
                color_mode=color_mode)

        test_set = to_catalogue.test_set(image_list=test_set_list,
                                         target_size=target_size,
                                         classes=classes,
                                         color_mode=color_mode,
                                         iterator_mode='arrays',
                                         batch_size=batch_size)
    else:

        assert image_path, 'no path to the image folders was provided'

        training_set, validation_set = to_catalogue.learning_set(
            image_path,
            split=split,
            target_size=target_size,
            iterator_mode='from_directory',
            batch_size=batch_size,
            classes=classes)

        test_set = to_catalogue.test_set(image_path,
                                         target_size=target_size,
                                         classes=classes,
                                         iterator_mode='from_directory',
                                         batch_size=batch_size)
    if k_fold:
        print('test set : {} batches of {} files'.format(
            len(test_set), batch_size))
    else:
        print('training set : {} batches of {} files'.format(
            len(training_set), batch_size))
        print('validation set : {} batches of {} files'.format(
            len(validation_set), batch_size))
        print('test set : {} batches of {} files'.format(
            len(test_set), batch_size))

    if classifier == 'tuner':
        # NOTE(review): no search function is exposed here; the tuner module
        # uses its default search strategy (e.g. RandomSearch)
        tuner.build_param(config_path)
        output_path = preprocessing.save_to_folder(input_path, project_name,
                                                   run_name)
        tuned_model = tuner.run_tuner(training_set,
                                      validation_set,
                                      project_name=output_path)
        model, history, metrics = tuner.best_model(tuned_model, training_set,
                                                   validation_set, test_set)
        conf_matrix, report = cnn.report_on_metrics(model, test_set)
        tuner.report_generation(model,
                                history,
                                metrics,
                                output_path,
                                tuner=tuned_model,
                                save_model=True)
    else:
        if k_fold:
            # k-fold path: the cnn module handles the per-fold splitting of
            # the learning set itself (arrays mode only provides the list)
            assert k, 'the number of folds needs to be provided'
            validation_score, model, history, final_score = \
                cnn.k_fold_model(k, config_path=config_path,
                                 target_size=target_size,
                                 classes=classes, batch_size=batch_size,
                                 color_mode=color_mode,
                                 iterator_mode=iterator_mode,
                                 image_list=learning_set_list,
                                 test_set=test_set)
            output_path = preprocessing.save_to_folder(input_path,
                                                       project_name, run_name)
            conf_matrix, report = cnn.report_on_metrics(model, test_set)
            tuner.report_generation(model,
                                    history,
                                    final_score,
                                    output_path,
                                    tuner=None,
                                    save_model=True,
                                    config_path=config_path,
                                    k_fold=k_fold,
                                    k=k)

        else:
            model, history = cnn.build_model(training_set,
                                             validation_set,
                                             config_path=config_path)
            metrics = cnn.evaluate_model(model, test_set)

            output_path = preprocessing.save_to_folder(input_path,
                                                       project_name, run_name)
            conf_matrix, report = cnn.report_on_metrics(model, test_set)
            tuner.report_generation(model,
                                    history,
                                    metrics,
                                    output_path,
                                    tuner=None,
                                    save_model=True,
                                    config_path=config_path)

    # per-image evaluation report; in arrays mode the original filename list
    # is available, so it is passed along for a richer report
    if iterator_mode == 'arrays':
        performance_evaluation = reporting.model_analysis(
            model, test_set, test_set_list)
    else:
        performance_evaluation = reporting.model_analysis(model, test_set)

    performance_evaluation.to_csv(output_path +
                                  'report/model_evaluation.csv')
    return
Example #4
0
 def test_save_to_folder(self):
     """Check that ``preprocessing.save_to_folder`` creates the run folder
     and returns its path.
     """
     folder_path = preprocessing.save_to_folder(
         data_path, 'test_project', 'test_1')
     assert os.path.exists(folder_path), \
         'the folder was not correctly created'
Example #5
0
def data_wrapper(run_name=None,
                 raw_datapath='./',
                 tform_command_dict=None,
                 classes=None,
                 plot_format="RGBrgb",
                 iterator_mode='arrays',
                 print_out=True,
                 project_name=None,
                 skiprows=0,
                 scale=1.0):
    """
    Overall "One-Click" Wrapper to create the three "Keras Ready" Datasets
        needed to train the model: "Training Set", "Validation Set" and
        "Test Set", all in the same format which is created via the
        Keras.Preprocessing.Data.Flow (<--- Not exact package/function)

    Loads the raw .csv files for ``run_name``, applies the transforms named
    in ``tform_command_dict[run_name]`` (if any), records the transform list
    in a per-run yaml report, and pickles the resulting image arrays to
    ``raw_datapath + run_name + '.pkl'``.

    Returns 0 in 'arrays' iterator mode, otherwise the path of the folder
    the images are expected to be saved in.
    """
    # the per-run transform commands are looked up in a shared dict (rather
    # than passed directly) to make parallel processing possible
    tform_commands = tform_command_dict[run_name]

    if print_out:
        clock = time.perf_counter()
        print("Processing Data...\t", end="")
    # make the raw dataframe tuples list
    raw_tuples_list = to_catalogue._data_tuples_from_fnames(raw_datapath,
                                                            classes=classes,
                                                            skiprows=skiprows)

    # now perform the transform, if given
    if tform_commands is None:
        tform_tuples_list = raw_tuples_list
    else:
        tform_tuples_list = arbitrage.tform_tuples(raw_tuples_list,
                                                   tform_commands,
                                                   rgb_format=plot_format)
    # save the transformation info in a yaml file for the final report
    if project_name and run_name:
        output = [[i, name.split('__')[0],
                   name.split('__')[-1]]
                  for i, name in enumerate(list(tform_tuples_list[0][1]))
                  if isinstance(name, str)]
        # collect the transform info in a dictionary
        run_tform = {'run_name': run_name}
        for i, tform_info in enumerate(output):
            run_tform['tform_' + str(i)] = tform_info
        # generate a yaml file to store the transformation info
        output_path = preprocessing.save_to_folder(raw_datapath, project_name,
                                                   run_name)
        report_location = output_path + '/report/'
        if not os.path.exists(report_location):
            os.makedirs(report_location)
        # the with-statement closes the file; no explicit close() needed
        with open(report_location + 'run_tform_config.yaml', 'w') as yaml_file:
            yaml.dump(run_tform, yaml_file)

    # next make the image tuples list; both plot formats pickle the result
    # to the same per-run storage location
    data_store = raw_datapath + run_name + '.pkl'
    if plot_format == 'RGBrgb':
        to_catalogue.rgb_list(tform_tuples_list,
                              scale=scale,
                              plot_format=plot_format,
                              storage_location=data_store)
    else:
        to_catalogue.regular_plot_list(tform_tuples_list,
                                       scale=scale,
                                       storage_location=data_store)

    # OK! Now we have image arrays finished!
    #     EITHER return 0 (the image tuples were pickled to data_store)
    #     OR return the path where saved images are expected
    if iterator_mode == 'arrays':
        if print_out:
            print_time(time.perf_counter() - clock)
        return 0
    else:
        # Write Optional Split based on Iterator_Mode,
        # to optionally use the "to_dirFlow"
        # path options (Already partly written!)...
        return os.path.join(raw_datapath, "images")