Example #1
    def test_classifier_wrapper(self):
        # path, data_path, split and classes are module-level fixtures
        # defined elsewhere in the test module (see the sketch below)
        num_files = 3
        run_name = 'test_1'
        config_path = './hardy/test/'
        output_path = preprocessing.save_to_folder(path, 'test_classifier',
                                                   run_name)
        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)
        run.classifier_wrapper(path,
                               test_set_filenames,
                               run_name,
                               config_path,
                               classifier='cnn',
                               iterator_mode='folder',
                               split=split,
                               classes=classes,
                               image_path=path,
                               project_name='test_classifier')
        report_dir = output_path + 'report/'
        report_location = os.listdir(report_dir)
        for item in report_location:
            if item.endswith('.yaml'):
                with open(report_dir + item, 'r') as file:
                    report = yaml.load(file, Loader=yaml.FullLoader)
                    assert isinstance(report, dict),\
                        'The filetype returned is not a dictionary'
        # remove report files after checking they were
        # correctly created
        shutil.rmtree(path + 'test_classifier/')
        print('the result folder was correctly deleted after testing')
        pass
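
The test methods in these examples rely on module-level fixtures (path, data_path, split and classes) defined elsewhere in the HARDy test module. A minimal sketch of what those fixtures could look like, with hypothetical paths and values chosen to match the examples below:

# Hypothetical module-level test fixtures; the actual values live in the
# HARDy test module itself.
data_path = './hardy/test/test_data/'   # folder containing the raw .csv files
path = './hardy/test/test_image/'       # folder where images and reports are written
split = 0.1                             # fraction of the learning set used for validation
classes = ['noise', 'one']              # class labels, matching the file name labels
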
Example #2
    def test_model_analysis(self):

        num_files = 1
        data_tups = catalogue._data_tuples_from_fnames(input_path=data_path)
        data_storage = data_path + 'test_1.pkl'
        catalogue.rgb_list(data_tups, storage_location=data_storage)
        plot_tups = handling.pickled_data_loader(data_path, 'test_1')

        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)

        test_set_list, learning_set_list = catalogue.data_set_split(
            plot_tups, test_set_filenames)
        train, val = catalogue.learning_set(image_list=learning_set_list,
                                            split=split,
                                            classes=['noise', 'one'],
                                            iterator_mode='arrays')
        testing_set = catalogue.test_set(image_list=test_set_list,
                                         classes=['noise', 'one'],
                                         iterator_mode='arrays')
        model, history = cnn.build_model(train,
                                         val,
                                         config_path='./hardy/test/')

        result = reporting.model_analysis(model, testing_set, test_set_list)

        assert isinstance(result, pd.DataFrame)
Example #3
    def test_hold_out_test_set(self):
        num_files = 2
        # From .csv files
        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)
        assert isinstance(test_set_filenames, list), 'format should be a list'
        assert len(test_set_filenames) == 2*num_files, \
            'the test set is not the correct length'
        assert len(np.unique(test_set_filenames)) == 2*num_files,\
            'the files selected to compose the test set should be unique'
        pass
Example #4
    def test_set_folder(self):
        num_files = 5
        # From .csv files
        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)
        test_folder = preprocessing.test_set_folder(
            data_path, test_set_filenames)
        assert isinstance(test_folder, str), \
            'the return should be the path to the test folder'
        assert os.path.exists(test_folder), \
            'the test folder was not correctly created'
        pass
Example #5
    def test_data_split(self):
        num_files = 3
        data_tups = catalogue._data_tuples_from_fnames(input_path=data_path)
        data_storage = data_path + 'test_1' + '.pkl'
        catalogue.rgb_list(data_tups, storage_location=data_storage)
        plot_tups = handling.pickled_data_loader(data_path, 'test_1')

        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)
        test_set_list, learning_set_list = catalogue.data_set_split(
            plot_tups, test_set_filenames)
        assert isinstance(test_set_filenames, list), 'format should be a list'
        assert len(test_set_filenames) == 2*num_files, \
            'the test set is not the correct length'
        assert isinstance(test_set_list, list), 'format should be a list'
        assert isinstance(learning_set_list, list), 'format should be a list'
        pass
Example #6
def checkpoint_datacreation(  # Data and Config Paths
        raw_datapath,
        tform_config_path,
        classifier_config_path,
        # Optional for Data
        iterator_mode='arrays',
        plot_format="RGBrgb",
        print_out=True,
        skiprows=0,
        # Optional for Classifier
        num_test_files_class=300,
        scale=1.0,
        classifier='tuner',
        split=0.1,
        target_size=(80, 80),
        batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run',
        k_fold=False,
        k=None,
        color_mode='rgb',
        seed=None):
    ''' Function that is part of the run_hardy main module. It is run by the
    checkrun function to evaluate the time and memory required by the
    preprocessing step.

    Parameters
    ----------
    raw_datapath : str
                   path to the raw .csv files containing the data to classify
    tform_config_path : str
                        string containing the path to the yaml file
                        containing the transformations to use and which
                        data in the .csv file to perform it on
    classifier_config_path : str
                             string containing the path to the yaml file
                             representing the classifier hyperparameters
    iterator_mode : str
                    option to use images from arrays directly or save the
                    .png and use a directory iterator mode
    plot_format : str
                  option for standard or RGB color gradient
    print_out : bool
                option for printing out feedback on computational time taken to
                initialize the data and generate the images
    num_test_files_class : int or float
                           number of files per class to select for the test
                           set
    classifier : str
                 options are 'cnn' or 'tuner'
    scale : float
            percentage of the image to reduce its size to
    split : float
            the percentage of the learning set to use for the validation step
    target_size : tuple
                  image target size. Presented as a tuple indicating the number
                  of pixels composing the two dimensions of the image (w x h)
    batch_size : int
                 The number of files to group up into a batch

    classes : list
              A list containing strings of the classes the data is divided
              into. The class names represent the folder names the files are
              contained in.
    project_name : str
                   name of the folder to be created for storing the results of
                   the tuning
    k_fold : bool
             bool value indicating if k-fold cross-validation is to be
             performed. Not valid for the tuner
    k : int
        integer value indicating how many folds need to be performed
    seed : int
           used in hold_out_test_set to isolate the testing data randomly for
           use in training of the neural network. Can be assigned a value to
           repeat the selection.


    Returns
    -------

    test_set_filenames : list
                         list indicating the test set data to be used for the
                         neural network
    image_data : list
                 list of images, each comprising the file name, image data,
                 and label
    image_path : list
                 list indicating the path to the saved images when the
                 directory iterator mode is used instead of arrays
    '''
    if tform_config_path is None:
        # ALLOWED so we can test functions without Transforms
        #    If so, create a list of one Tform_config, which will be "None"
        tform_command_list = ["no_transform"]
        tform_command_dict = {"no_transform": None}
    else:
        # Import the Tform Config List (and the dictionary for it)
        tform_command_list, tform_command_dict = \
            arbitrage.import_tform_config(tform_config_path)
        pass
    # ===========================
    # 1b) ANY OTHER SETUP?
    # ===========================

    test_set_filenames = preprocessing.hold_out_test_set(
        raw_datapath,
        number_of_files_per_class=num_test_files_class,
        classes=classes,
        seed=seed)

    # ============================================
    # Section 2: Data Wrapper        (Setup + Run)
    # ============================================
    # tform_commands = tform_command_dict[tform_name]

    data_dict = {}
    partial_data_wrapper = partial(data_wrapper,
                                   raw_datapath=raw_datapath,
                                   plot_format=plot_format,
                                   iterator_mode=iterator_mode,
                                   print_out=print_out,
                                   scale=scale,
                                   project_name=project_name,
                                   classes=classes,
                                   skiprows=skiprows,
                                   tform_command_dict=tform_command_dict)
    # need to pass tform_command_dict to data_wrapper instead of
    # tform_commands,
    # determine tform_commands in the
    # data_wrapper

    # for tform_name in tform_command_list:

    # ============================================
    # Section 2: Data Wrapper        (Setup + Run)
    # ============================================
    #    tform_commands = tform_command_dict[tform_name]
    #    print(tform_commands)
    pool = mp.Pool(processes=mp.cpu_count())

    if iterator_mode == 'arrays':
        image_data = pool.map(partial_data_wrapper, tform_command_list)
        image_path = None
        for i in range(len(tform_command_list)):
            data_dict[tform_command_list[i]] = image_data[i]
    #    data_dict[tform_name] = image_data
    else:
        image_data = None
        image_path = pool.map(partial_data_wrapper, tform_command_list)

    pool.close()

    return test_set_filenames, image_data, image_path
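
A minimal usage sketch for checkpoint_datacreation, assuming hypothetical data and configuration paths; in normal use it is called for you by checkrun (Example #7) rather than invoked directly:

# Hypothetical paths; substitute your own dataset and yaml configuration files.
test_filenames, image_data, image_path = checkpoint_datacreation(
    './my_dataset/',                      # folder of raw .csv files
    './my_configs/tform_config.yaml',     # transformation configuration
    './my_configs/cnn_config.yaml',       # classifier configuration
    num_test_files_class=5,
    classes=['class_1', 'class_2'],
    seed=42)
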
Example #7
def checkrun(
        raw_datapath,
        tform_config_path,
        classifier_config_path,
        # Optional for Data
        iterator_mode='arrays',
        plot_format="RGBrgb",
        print_out=True,
        skiprows=0,
        # Optional for Classifier
        scale=1.0,
        classifier='tuner',
        split=0.1,
        target_size=(80, 80),
        batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run',
        k_fold=False,
        k=None,
        color_mode='rgb',
        seed=None):
    ''' Check-run function that, when executed, runs all the transformations
    over 1% of the data and reports the estimated time and memory required
    for the complete data preprocessing step.

    Parameters
    ----------
    raw_datapath : str
                   path to the raw .csv files containing the data to classify
    tform_config_path : str
                        string containing the path to the yaml file
                        containing the transformations to use and which
                        data in the .csv file to perform it on
    classifier_config_path : str
                             string containing the path to the yaml file
                             representing the classifier hyperparameters
    iterator_mode : str
                    option to use images from arrays directly or save the
                    .png and use a directory iterator mode
    plot_format : str
                  option for standard or RGB color gradient
    print_out : bool
                option for printing out feedback on computational time taken to
                initialize the data and generate the images
    classifier : str
                 options are 'cnn' or 'tuner'
    scale : float
            percentage of the image to reduce its size to
    split : float
            the percentage of the learning set to use for the validation step
    target_size : tuple
                  image target size. Presented as a tuple indicating the number
                  of pixels composing the two dimensions of the image (w x h)
    batch_size : int
                 The number of files to group up into a batch

    classes : list
              A list containing strings of the classes the data is divided
              into. The class names represent the folder names the files are
              contained in.
    project_name : str
                   name of the folder to be created for storing the results of
                   the tuning
    k_fold : bool
             bool value indicating if k-fold cross-validation is to be
             performed. Not valid for the tuner
    k : int
        integer value indicating how many folds need to be performed
    seed : int
           used in hold_out_test_set to isolate the testing data randomly for
           use in training of the neural network. Can be assigned a value to
           repeat the selection.

    '''
    # listing down the filenames
    file_names = [
        item for item in os.listdir(raw_datapath) if item.endswith('.csv')
    ]

    # calculations for extracting 1% of data
    range_filename = int(len(file_names) * 0.01)
    # check to pass test or when the dataset is very small
    if range_filename == 0:
        range_filename += 1

    range_classes = len(classes)

    # calculating files per class to pass to hold_out_test_set,
    # for getting the filenames to be transferred to the
    # temporary folder
    file_per_class = round(range_filename / range_classes)
    # check to pass test or when the dataset is very small
    if file_per_class == 0:
        file_per_class += 1

    # calculating the test_set_filenames to be used for
    # the hold_out_test_set in the checkpoint_datacreation
    # function
    num_test_files_class = round(0.25 * file_per_class)

    # start measuring time here
    time_1 = perf_counter()

    # starting memory tracer
    tracemalloc.start()

    # getting filenames to collect files for checkrun
    file_names_for_test = preprocessing.hold_out_test_set(
        raw_datapath,
        number_of_files_per_class=file_per_class,
        classes=classes)

    # adding the .csv extension to the filenames and storing them
    # in a separate list
    file_names_csv = []
    for item in file_names_for_test:
        file_names_csv.append(item + '.csv')

    # making directory to store the temporary data
    os.mkdir(os.path.join(raw_datapath, 'temp/'))

    # copying the 1% data sample to a separate folder
    for item in file_names_csv:
        shutil.copy(os.path.join(raw_datapath, item),
                    os.path.join(raw_datapath, 'temp/'))

    # creating the new data path to be passed to the
    # checkpoint_datacreation function
    new_data_path = os.path.join(raw_datapath, 'temp/')

    checkpoint_datacreation(
        new_data_path,
        tform_config_path,
        classifier_config_path,
        # Optional for Data
        iterator_mode=iterator_mode,
        plot_format=plot_format,
        print_out=print_out,
        skiprows=skiprows,
        # Optional for Classifier
        num_test_files_class=num_test_files_class,
        scale=scale,
        classifier=classifier,
        split=split,
        target_size=target_size,
        batch_size=batch_size,
        classes=classes,
        project_name=project_name,
        k_fold=k_fold,
        k=k,
        color_mode=color_mode,
        seed=seed)

    # getting feedback from memory tracer
    current, peak = tracemalloc.get_traced_memory()

    # stopping memory tracer
    tracemalloc.stop()
    time_2 = perf_counter()
    time_elapsed = time_2 - time_1

    print("The total time required for data creation will be approx.\
         {} hours".format(round((time_elapsed * 100) / 3600, 3)))

    print("The total memory required for the process will be approx.\
         {} Gigabytes".format(round(peak * 100 / (10**9), 3)))

    # removing the temporary data folder
    shutil.rmtree(new_data_path)

    print("Temporary files created by process are successfully deleted")
Example #8
def hardy_main(  # Data and Config Paths
        raw_datapath,
        tform_config_path,
        classifier_config_path,
        # Optional for Data
        iterator_mode='arrays',
        plot_format="RGBrgb",
        print_out=True,
        skiprows=0,
        # Optional for Classifier
        num_test_files_class=300,
        scale=1.0,
        classifier='tuner',
        split=0.1,
        target_size=(80, 80),
        batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run',
        k_fold=False,
        k=None,
        color_mode='rgb',
        seed=None,
        n_threads=1):
    """
    OVERALL wrapper function, which passes the initial configurations and
    allows all other internal functions to understand and call upon each other.

    Parameters
    ----------
    raw_datapath : str
                   path to the raw .csv files containing the data to classify
    tform_config_path : str
                        string containing the path to the yaml file
                        containing the transformations to use and which
                        data in the .csv file to perform it on
    classifier_config_path : str
                             string containing the path to the yaml file
                             representing the classifier hyperparameters
    iterator_mode : str
                    option to use images from arrays directly or save the
                    .png and use a directory iterator mode
    plot_format : str
                  option for standard or RGB color gradient
    print_out : bool
                option for printing out feedback on computational time taken to
                initialize the data and generate the images
    num_test_files_class : int or float
                           number of files per class to select for the test
                           set
    classifier : str
                 options are 'cnn' or 'tuner'
    scale : float
            percentage of the image to reduce its size to
    split : float
            the percentage of the learning set to use for the validation step
    target_size : tuple
                  image target size. Presented as a tuple indicating the number
                  of pixels composing the two dimensions of the image (w x h)
    batch_size : int
                 The number of files to group up into a batch

    classes : list
              A list containing strings of the classes the data is divided
              into. The class names represent the folder names the files are
              contained in.
    project_name : str
                   name of the folder to be created for storing the results of
                   the tuning
    k_fold : bool
             bool value indicating if k-fold cross-validation is to be
             performed. Not valid for the tuner
    k : int
        integer value indicating how many folds need to be performed
    seed : int
           used in hold_out_test_set to isolate the testing data randomly for
           use in training of the neural network. Can be assigned a value to
           repeat the selection.
    n_threads: int
               Number of cores used for parallel processing during the data
               transformation stage.

    Function Calls  (see their related documentation)
    --------------
    import_tform_config :   f(n) of ARBITRAGE.py
                            Import the list and dictionary of transforms
                            to be looped through.
                            (full model and report for each)

    data_wrapper :  Local Wrapping f(n)
                    Takes file path and the transformation command
                    for the current loop, and creates the list-of-tuple
                    images (OR Saves Image Files to be used later)

    classifier_wrapper :    Local Wrapping f(n)
                            Takes many inputs including configuration
                            loading directions. Loads images, and
                            makes them Keras-Readable.
                            Then sets up the model and the tuner,
                            and runs the model test/train/tune loops
                            as commanded.

    Returns
    -------
    * returns a folder containing subfolders, one for each trial run in the
    hardy module. These folders will contain a report on the run, as well as
    the best classifier model

    """
    # ================================================
    # Section 1: Setup and Import Transforms
    # ================================================
    if tform_config_path is None:
        # ALLOWED so we can test functions without Transforms
        #    If so, create a list of one Tform_config, which will be "None"
        tform_command_list = ["no_transform"]
        tform_command_dict = {"no_transform": None}
    else:
        # Import the Tform Config List (and the dictionary for it)
        tform_command_list, tform_command_dict = \
            arbitrage.import_tform_config(tform_config_path)
        pass
    # ===========================
    # 1b) ANY OTHER SETUP?
    # ===========================

    test_set_filenames = preprocessing.hold_out_test_set(
        raw_datapath,
        number_of_files_per_class=num_test_files_class,
        classes=classes,
        seed=seed)

    # # Make the raw Dataframe Tuples List
    # raw_tuples_list = to_catalogue._data_tuples_from_fnames(
    #     raw_datapath, classes=classes, skiprows=skiprows)

    # data_dict = {}
    partial_data_wrapper = partial(data_wrapper,
                                   raw_datapath=raw_datapath,
                                   plot_format=plot_format,
                                   iterator_mode=iterator_mode,
                                   print_out=print_out,
                                   scale=scale,
                                   project_name=project_name,
                                   classes=classes,
                                   skiprows=skiprows,
                                   tform_command_dict=tform_command_dict)
    # need to pass tform_command_dict to data_wrapper instead of
    # tform_commands,
    # determine tform_commands in the
    # data_wrapper

    # for tform_name in tform_command_list:

    # ============================================
    # Section 2: Data Wrapper        (Setup + Run)
    # ============================================
    #    tform_commands = tform_command_dict[tform_name]
    #    print(tform_commands)
    pool = mp.Pool(processes=n_threads)

    if iterator_mode == 'arrays':
        pool.map(partial_data_wrapper, tform_command_list)
        image_path = None

    #    for i in range(len(tform_command_list)):
    #        data_dict[tform_command_list[i]] = image_data[i]
    #    data_dict[tform_name] = image_data
    else:
        # image_data = None
        image_path = pool.map(partial_data_wrapper, tform_command_list)

    pool.close()

    # ============================================
    # Section 3: Classifier Wrapper  (Setup + Run)
    # ============================================
    for tform_name in tform_command_list:

        # Image PATH is none, but we can pass DATA
        classifier_wrapper(raw_datapath,
                           test_set_filenames,
                           tform_name,
                           classifier_config_path,
                           classifier=classifier,
                           iterator_mode=iterator_mode,
                           split=split,
                           color_mode=color_mode,
                           target_size=target_size,
                           batch_size=batch_size,
                           image_path=image_path,
                           classes=classes,
                           project_name=project_name,
                           k_fold=k_fold,
                           k=k)
        # NO OUTPUT? - it outputs the report file

    return None
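
A minimal sketch of a full run through hardy_main, with hypothetical paths and settings; each transformation listed in the yaml configuration gets its own result subfolder containing the run report and the best classifier model:

# Hypothetical paths and settings; see the docstring above for all options.
hardy_main('./my_dataset/',
           './my_configs/tform_config.yaml',
           './my_configs/tuner_config.yaml',
           num_test_files_class=10,
           classifier='tuner',
           classes=['class_1', 'class_2'],
           project_name='my_hardy_run',
           n_threads=4)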