Example #1
def LoadDataset():
    ml = DataImporter()
    print("Loading movie ratings...")
    data = ml.loadDatasetFromFile()
    print("\nComputing movie popularity ranks")
    rankings = ml.getPopularityRanks()
    return (ml, data, rankings)
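
A minimal usage sketch of the loader above; the (ml, data, rankings) unpacking mirrors the return statement, and the final print assumes getPopularityRanks returns a sized container:

# Hypothetical call site; LoadDataset is defined in the snippet above.
(ml, data, rankings) = LoadDataset()
print("Number of popularity ranks:", len(rankings))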
Example #2
    def get_data_sets(import_config):
        di = DataImporter()

        # standardized_photo_path = di.construct_standardized_photo_path(import_config["data_import"]["standardized_photos"], import_config["data_import"]["output_image_dimensions"])

        data_set_dictionary = di.import_all_data(
            import_config["data_import"]["data_description_file_path"],
            os.path.join(
                import_config["data_import"]["standardized_photos"],
                'output_image_dimensions@' + json.dumps(
                    import_config["data_import"]["output_image_dimensions"])))

        target_config = import_config["learning"]["target"]

        data_set_dictionary["output_type"] = target_config["output_type"]

        if isinstance(target_config["value_names"], list):
            data_set_dictionary["value_names"] = target_config["value_names"]
        else:
            data_set_dictionary["value_names"] = dict(
                zip(
                    data_set_dictionary["training"][1][
                        target_config["column_header"]],
                    data_set_dictionary["meta"]["training"][
                        target_config["value_names"]]))

        data_set_dictionary["training"] = (
            data_set_dictionary["training"][0],
            np.array(data_set_dictionary["training"][1][
                target_config["column_header"]]))
        data_set_dictionary["validation"] = (
            data_set_dictionary["validation"][0],
            np.array(data_set_dictionary["validation"][1][
                target_config["column_header"]]))
        data_set_dictionary["test"] = (data_set_dictionary["test"][0],
                                       np.array(data_set_dictionary["test"][1][
                                           target_config["column_header"]]))

        if target_config["output_type"] == 'bool':
            data_set_dictionary["to_network"] = lambda x: np.reshape(
                x, [-1, 1])
            data_set_dictionary["from_network"] = lambda x: np.reshape(x, [-1])

        if target_config["output_type"] == 'float':
            scale = np.max(data_set_dictionary["training"][1])
            data_set_dictionary["to_network"] = lambda x: np.reshape(
                x, [-1, 1]) / scale
            data_set_dictionary["from_network"] = lambda x: np.reshape(
                x, [-1]) * scale

        if target_config["output_type"] == 'categorical_int':
            data_set_dictionary["to_network"] = lambda x: to_categorical(x)
            data_set_dictionary["from_network"] = lambda x: np.argmax(x, axis=-1)

        return data_set_dictionary
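
For reference, a rough sketch of an import_config that satisfies the keys read above; every concrete value here is an illustrative assumption, not taken from the project (the dimensions and column names echo those used in the test examples further down):

example_import_config = {
    "data_import": {
        "data_description_file_path": "data/test_photos.csv",
        "standardized_photos": "data/standardized_photos",
        "output_image_dimensions": [152, 114],
    },
    "learning": {
        "target": {
            "output_type": "categorical_int",
            "column_header": "label_type_int",
            # a non-list value here triggers the dict-building else branch
            "value_names": "label_type_str",
        },
    },
}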
Example #3
from tkinter import Tk


def main():
    # Creation of the root frame of the application
    root = Tk()
    root.title("ML APP")
    root.resizable(width=False, height=False)

    # Call of the first step window (Data importer)
    step1 = DataImporter(root)
    step1.pack_frame()

    root.mainloop()
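
A hypothetical entry-point guard so the snippet runs as a script (the DataImporter frame class is assumed to be imported elsewhere in the module):

if __name__ == "__main__":
    main()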
Example #4
    @classmethod
    def setUpClass(cls):

        cls.unstandardized_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources', 'runner',
            'unstandardized_photos')
        cls.photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources', 'runner',
            'photos')
        cls.test_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources', 'runner',
            'test_photos.csv')

        cls.dataImporter = DataImporter()

        cls.dataImporter.convert_to_standard_resolution(
            cls.unstandardized_photos_path, cls.photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        cls.data_set = cls.dataImporter.import_all_data(
            cls.test_photos_csv_file_path, cls.photos_path)

        cls.mock_results = Learning.simple_binary_classification(
            cls.data_set, epochs=1, batch_size=2)
Example #5
def run(argv):

    parameter_space_file_path = argv[1]

    parameter_space = ParameterSpace(
        ParameterSpace.load(parameter_space_file_path).config["data_import"])

    configurations = parameter_space.get_configuration_grid()

    for config in configurations:
        di = DataImporter()
        start = time.time()

        sink_path = os.path.join(config["standardized_photos"],
                                 config["configuration_name"])

        di.convert_to_standard_resolution(config["unstandardized_photos"],
                                          sink_path,
                                          config["output_image_dimensions"],
                                          config["loading_config"])
        after_convert_t = time.time()
        print('standardizing took', after_convert_t - start, 'seconds')
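
A sketch of how run might be wired up as a script entry point; passing sys.argv directly is an assumption based on the argv[1] access above:

import sys

if __name__ == '__main__':
    run(sys.argv)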
Example #6
class CreateDashboard:
    def __init__(self):
        self.data_import = DataImporter()
        self.plots = Plots()
        self.maps = Maps()
        self.state_list = None

    def run(self):
        self.data_preparation()
        self.create_plots()
        self.create_maps()
        self.create_html()
        print(self.data_import.recent_date)

    def data_preparation(self):
        # getImportedData() returns the imported data object (it exposes
        # df_covid19, state_list, df_state_coor, state_geo and recent_date,
        # all used below), replacing the importer instance on this attribute.
        self.data_import = self.data_import.getImportedData()

    def create_plots(self):
        self.plots = Plots(self.data_import.df_covid19['state'])
        # self.plots.time_series_plot(self.state_list)
        self.plots.time_series_plot(self.data_import.state_list)

    def create_maps(self):
        self.maps = Maps(data_state=self.data_import.df_state_coor,
                         data_geo=self.data_import.state_geo)

        # Layer 1: create a choropleth map, color-coded by the current number of COVID-19 cases
        self.maps.create_choropleth(metric='cases')

        # Layer 2: create markers for each state
        # Convert the Bokeh figures to HTML
        self.maps.convert_to_html(self.plots.plots_dict)

        # Create the markers
        self.maps.create_marker(self.plots.plots_dict)

        self.maps.add_layer_control()
        self.maps.save_dashboard()
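
A brief usage sketch, following directly from the definitions above; no constructor arguments are required:

dashboard = CreateDashboard()
dashboard.run()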
Example #7
class test_Analysis(unittest.TestCase):
    def setUp(self):
        self.result_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'analysis', 'result')

        self.unstandardized_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'analysis', 'unstandardized_photos')
        self.photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'analysis', 'photos')
        self.test_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'analysis', 'test_photos.csv')

        self.dataImporter = DataImporter()

        self.dataImporter.convert_to_standard_resolution(
            self.unstandardized_photos_path, self.photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

    def tearDown(self):
        shutil.rmtree(self.result_path)

    def test_simple_binary_classification(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = Learning.simple_binary_classification(data_set,
                                                        epochs=1,
                                                        batch_size=2)

        Analysis.store_raw_result(self.result_path, results)
        Analysis.process_result(self.result_path)

    def test_simple_categorical_classification(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = Learning.simple_categorical_classification(data_set,
                                                             epochs=1,
                                                             batch_size=2)

        Analysis.store_raw_result(self.result_path, results)
        Analysis.process_result(self.result_path)

    def test_simple_crow_score_regression(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = Learning.simple_crow_score_regression(data_set,
                                                        epochs=1,
                                                        batch_size=2)

        Analysis.store_raw_result(self.result_path, results)
        Analysis.process_result(self.result_path)
Example #8
from DataImporter import DataImporter

di = DataImporter()
result = di.getDataMatrix('Cincinnati', 2017)
print(result)
print(di.getTeamEstimate('Cincinnati', 2017))
Example #9
from Configuration import Configuration
from DataImporter import DataImporter
from SimpleLearning import SimpleLearning
from Analysis import Analysis
import os
import os.path

config = Configuration.load()

description = config.parseArguments()

print("ARGUMENTS PARSED: " + description)

config = config.config

di = DataImporter()

data_set_dictionary = di.import_all_data(
    config["data_import"]["data_description_file_path"],
    config["data_import"]["standardized_photos"])

print("DATA IMPORTED")

binary_results = SimpleLearning.simple_binary_classification(
    data_set_dictionary,
    epochs=config["learning"]["epochs"],
    batch_size=config["learning"]["batch_size"])
print("BINARY")
print("binary_results")
print(binary_results)
binary_result_path = os.path.join(config["analysis"]["result_base_path"],
Example #10
class TestDataImporter(unittest.TestCase):
    def setUp(self):

        self.photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'photos')
        self.unstandardized_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'unstandardized_photos')
        self.unstandardized_live_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'unstandardized_live_photos')

        self.edge_case_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'edgeCasePhotos')
        self.fixed_size_edge_case_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'fixed_size_edge_case_photos')

        self.fixed_size_images_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'fixed_size_images')

        self.temp_reduced_edge_case_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'temp_reduced')

        self.test_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'test_photos.csv')
        self.test_live_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'test_live_photos.csv')
        self.reduced_test_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'reduced_test_photos.csv')

        self.dataImporter = DataImporter()

        self.test_photos_description = pd.read_csv(
            self.test_photos_csv_file_path)

        self.reduced_test_photos_description = pd.read_csv(
            self.reduced_test_photos_csv_file_path)

        self.cache_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'dataImport', 'cache.dill')

    def tearDown(self):

        self.config = None
        self.dataImporter = None

        if os.path.isdir(self.temp_reduced_edge_case_photos_path):
            shutil.rmtree(self.temp_reduced_edge_case_photos_path)

        if os.path.isdir(self.photos_path):
            shutil.rmtree(self.photos_path)

        if os.path.isfile(self.cache_file_path):
            os.remove(self.cache_file_path)

    # live data
    def test_live_data(self):
        self.dataImporter.convert_to_standard_resolution(
            self.unstandardized_live_photos_path, self.photos_path, [152, 114],
            {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        data_set_dictionary = self.dataImporter.load_from_cache(
            self.cache_file_path, self.test_live_photos_csv_file_path,
            self.photos_path)

        self.assertTrue(data_set_dictionary is not None)

    # file management
    def test_load_from_cache(self):
        self.dataImporter.convert_to_standard_resolution(
            self.unstandardized_photos_path, self.photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        self.assertTrue(not os.path.isfile(self.cache_file_path))

        data_set_dictionary = self.dataImporter.load_from_cache(
            self.cache_file_path, self.test_photos_csv_file_path,
            self.photos_path)

        first_crow_labels = np.array(
            data_set_dictionary["training"][1].label_crow_score_int)

        self.assertTrue(os.path.isfile(self.cache_file_path))

        modification_time = os.path.getmtime(self.cache_file_path)
        creation_time = os.path.getctime(self.cache_file_path)

        data_set_dictionary2 = self.dataImporter.load_from_cache(
            self.cache_file_path, self.test_photos_csv_file_path,
            self.photos_path)

        second_crow_labels = np.array(
            data_set_dictionary2["training"][1].label_crow_score_int)

        self.assertTrue(np.all(first_crow_labels == second_crow_labels))

        self.assertTrue(
            modification_time == os.path.getmtime(self.cache_file_path))
        self.assertTrue(
            creation_time == os.path.getctime(self.cache_file_path))

    # data splitting

    def test_import_all_data(self):

        self.dataImporter.convert_to_standard_resolution(
            self.unstandardized_photos_path, self.photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        data_set_dictionary = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        self.assertTrue(
            np.all(data_set_dictionary["training"][0].shape == (8, 152, 114,
                                                                3)))
        self.assertTrue(
            np.all(data_set_dictionary["validation"][0].shape == (4, 152, 114,
                                                                  3)))
        self.assertTrue(
            np.all(data_set_dictionary["test"][0].shape == (4, 152, 114, 3)))

        #  all data must be uniquely assigned to either training, validation or test
        self.assertTrue(
            np.unique(
                np.concatenate([
                    np.array(data_set_dictionary["test"]
                             [1].label_crow_score_int),
                    np.array(data_set_dictionary["validation"]
                             [1].label_crow_score_int),
                    np.array(data_set_dictionary["training"]
                             [1].label_crow_score_int)
                ])).shape[0] == 16)
        self.assertTrue(
            len(
                set(
                    list(data_set_dictionary["meta"]
                         ["test"].label_crow_score_str) +
                    list(data_set_dictionary["meta"]
                         ["validation"].label_crow_score_str) +
                    list(data_set_dictionary["meta"]
                         ["training"].label_crow_score_str))) == 16)

    def test_split_data_set(self):

        label_type_int_labels = np.tile(
            np.array([0, 1, 2, 2, 3, 3, 3, 3]).repeat(5), [4])

        labels_df = pd.DataFrame({
            'label_type_int':
            label_type_int_labels,
            'label_id':
            np.arange(label_type_int_labels.shape[0])
        })

        mock_image_tensors = np.random.random(
            [label_type_int_labels.shape[0], 4, 4, 3])

        data_set = (mock_image_tensors, labels_df)

        split_data_set, split_indices = self.dataImporter.split_data_set(
            data_set, {
                "training": .5,
                "validation": .25,
                "test": .25
            })

        self.assertTrue(
            np.sum(split_data_set["training"][1].label_type_int == 0) == (
                np.sum(label_type_int_labels == 1) / 2))

        self.assertTrue(
            np.sum(split_data_set["validation"][1].label_type_int == 2) == (
                np.sum(label_type_int_labels == 2) / 4))

        self.assertTrue(
            np.sum(split_data_set["test"][1].label_type_int == 3) == (
                np.sum(label_type_int_labels == 3) / 4))

        #  split indices are all disjoint subsets of the original index set
        self.assertTrue(
            np.unique(
                np.concatenate([
                    split_indices["training"], split_indices["validation"],
                    split_indices["test"]
                ])).shape[0] == label_type_int_labels.shape[0])

        #  images are indexed correctly
        self.assertTrue(
            np.all(mock_image_tensors[split_indices["training"], :, :, :] ==
                   split_data_set["training"][0]))
        self.assertTrue(
            np.all(mock_image_tensors[split_indices["validation"], :, :, :] ==
                   split_data_set["validation"][0]))
        self.assertTrue(
            np.all(mock_image_tensors[split_indices["test"], :, :, :] ==
                   split_data_set["test"][0]))

        self.assertTrue(
            np.all(
                np.array(labels_df.label_id[split_indices["training"]]) ==
                np.array(split_data_set["training"][1].label_id)))
        self.assertTrue(
            np.all(
                np.array(labels_df.label_id[split_indices["validation"]]) ==
                np.array(split_data_set["validation"][1].label_id)))
        self.assertTrue(
            np.all(
                np.array(labels_df.label_id[split_indices["test"]]) ==
                np.array(split_data_set["test"][1].label_id)))

    def test_stratified_split_indices_from(self):

        # four label values: 1/8 of the samples are 0, 1/8 are 1, 1/4 are 2 and 1/2 are 3
        labels = np.tile(np.array([0, 1, 2, 2, 3, 3, 3, 3]).repeat(5), [4])

        split = self.dataImporter.stratified_split_indices_from(
            labels, {
                "training": .5,
                "validation": .25,
                "test": .25
            })

        training_labels = labels[split["training"]]
        validation_labels = labels[split["validation"]]
        test_labels = labels[split["test"]]

        self.assertTrue(training_labels.shape[0] == (labels.shape[0] / 2))
        self.assertTrue(validation_labels.shape[0] == (labels.shape[0] / 4))
        self.assertTrue(test_labels.shape[0] == (labels.shape[0] / 4))

        self.assertTrue(
            np.sum(training_labels == 0) == (training_labels.shape[0] / 8))
        self.assertTrue(
            np.sum(training_labels == 1) == (training_labels.shape[0] / 8))
        self.assertTrue(
            np.sum(training_labels == 2) == (training_labels.shape[0] / 4))
        self.assertTrue(
            np.sum(training_labels == 3) == (training_labels.shape[0] / 2))

        self.assertTrue(
            np.sum(validation_labels == 0) == (validation_labels.shape[0] / 8))
        self.assertTrue(
            np.sum(validation_labels == 1) == (validation_labels.shape[0] / 8))
        self.assertTrue(
            np.sum(validation_labels == 2) == (validation_labels.shape[0] / 4))
        self.assertTrue(
            np.sum(validation_labels == 3) == (validation_labels.shape[0] / 2))

        self.assertTrue(np.sum(test_labels == 0) == (test_labels.shape[0] / 8))
        self.assertTrue(np.sum(test_labels == 1) == (test_labels.shape[0] / 8))
        self.assertTrue(np.sum(test_labels == 2) == (test_labels.shape[0] / 4))
        self.assertTrue(np.sum(test_labels == 3) == (test_labels.shape[0] / 2))

        #  note: with this small sample size, a spurious Spearman correlation may occasionally be detected; rerun if these assertions fail
        self.assertTrue(
            scipy.stats.spearmanr(np.arange(training_labels.shape[0]),
                                  training_labels).pvalue > .1)
        self.assertTrue(
            scipy.stats.spearmanr(np.arange(validation_labels.shape[0]),
                                  validation_labels).pvalue > .1)
        self.assertTrue(
            scipy.stats.spearmanr(np.arange(test_labels.shape[0]),
                                  test_labels).pvalue > .1)

    # data loading, saving and parallelization
    def test_load_data_set(self):

        self.dataImporter.convert_to_standard_resolution(
            self.edge_case_photos_path, self.fixed_size_edge_case_photos_path,
            [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        images, labels, meta = self.dataImporter.load_data_set(
            self.reduced_test_photos_csv_file_path,
            self.fixed_size_edge_case_photos_path)

        self.assertTrue(
            np.all(meta.filename == [
                'high.jpg', 'wide.jpg', 'bigSquare.jpg', 'smallSquare.jpg',
                'exact.jpg'
            ]))
        self.assertTrue(
            np.all(meta.label_crow_score_str == ['A', 'B', 'C', 'D', 'E']))
        self.assertTrue(
            np.all(meta.label_type_str ==
                   ['gemengd', 'plastic', 'gemengd', 'gemengd', 'gemengd']))

        self.assertTrue(np.all(labels.label_type_int == [7, 1, 7, 7, 7]),
                        'label type should be loaded correctly')
        self.assertTrue(np.all(labels.label_crow_score_int == [0, 1, 2, 3, 5]),
                        'label crow score should be loaded correctly')
        self.assertTrue(np.all(labels.label_clean_int == [0, 1, 0, 1, 0]),
                        'label clean int should be loaded correctly')
        self.assertTrue(images.shape == (5, 152, 114, 3))

    def test_convert_to_standard_resolution(self):
        self.dataImporter.convert_to_standard_resolution(
            self.edge_case_photos_path,
            self.temp_reduced_edge_case_photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

        source_path = self.temp_reduced_edge_case_photos_path
        target_images = list(
            filter(
                lambda name: name.endswith('.jpg') and
                (not name.startswith('.')), os.listdir(source_path)))

        source_image_file_paths = list(
            map(lambda file_name: os.path.join(source_path, file_name),
                target_images))

        image_arrays, is_successful = self.dataImporter.load_image_tensors(
            source_image_file_paths)

        self.assertTrue(image_arrays.shape == (5, 152, 114, 3))

    def test_parallel_load_image_tensor(self):
        source_path = self.edge_case_photos_path
        target_images = list(
            filter(
                lambda name: name.endswith('.jpg') and
                (not name.startswith('.')), os.listdir(source_path)))

        source_image_file_paths = list(
            map(lambda file_name: os.path.join(source_path, file_name),
                target_images))

        image_arrays, is_successful = \
            self.dataImporter.parallel_load_image_tensor(
                source_image_file_paths,
                [152, 114],
                {
                    "multi_core_count": 2,
                    "timeout_secs": 100,
                    "chunk_size": 2
                }
            )

        self.assertTrue(image_arrays.shape == (5, 152, 114, 3))

    def test_load_image_tensors(self):
        source_path = self.fixed_size_images_path
        target_images = list(
            filter(
                lambda name: name.endswith('.jpg') and
                (not name.startswith('.')), os.listdir(source_path)))

        source_image_file_paths = map(
            lambda file_name: os.path.join(source_path, file_name),
            target_images)

        image_arrays, is_successful = self.dataImporter.load_image_tensors(
            source_image_file_paths)

        self.assertTrue(image_arrays.shape == (3, 152, 114, 3))

    # image processing

    def test_load_and_standardize_image(self):
        big_standard_resolution_image, is_successful = self.dataImporter.load_and_standardize_image(
            os.path.join(self.edge_case_photos_path, 'bigSquare.jpg'),
            [114, 152])

        self.assertTrue(
            big_standard_resolution_image.shape == (1, 114, 152, 3),
            'files must be loaded and standardized correctly')
        self.assertTrue(is_successful)

        missing_image, is_successful = self.dataImporter.load_and_standardize_image(
            os.path.join(self.edge_case_photos_path, 'asdf.jpg'), [114, 152])

        self.assertTrue(missing_image is None,
                        'files must be loaded and standardized correctly')
        self.assertTrue(not is_successful)

    def test_standardize_image_ratio(self):

        target_ratio = 114.0 / 152.0

        big_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path,
                                  'bigSquare.jpg')))
        small_image = np.array(
            load_img(
                os.path.join(self.edge_case_photos_path, 'smallSquare.jpg')))
        wide_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'wide.jpg')))
        high_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'high.jpg')))
        exact_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'exact.jpg')))

        big_standard_ratio_image = self.dataImporter.standardize_image_ratio(
            big_image, target_ratio)
        small_standard_ratio_image = self.dataImporter.standardize_image_ratio(
            small_image, target_ratio)
        wide_standard_ratio_image = self.dataImporter.standardize_image_ratio(
            wide_image, target_ratio)
        high_standard_ratio_image = self.dataImporter.standardize_image_ratio(
            high_image, target_ratio)
        exact_standard_ratio_image = self.dataImporter.standardize_image_ratio(
            exact_image, target_ratio)

        big_image_standard_ratio = big_standard_ratio_image.shape[
            1] / big_standard_ratio_image.shape[0]
        small_image_standard_ratio = small_standard_ratio_image.shape[
            1] / small_standard_ratio_image.shape[0]
        wide_image_standard_ratio = wide_standard_ratio_image.shape[
            1] / wide_standard_ratio_image.shape[0]
        high_image_standard_ratio = high_standard_ratio_image.shape[
            1] / high_standard_ratio_image.shape[0]
        exact_image_standard_ratio = exact_standard_ratio_image.shape[
            1] / exact_standard_ratio_image.shape[0]

        self.assertTrue(
            abs(big_image_standard_ratio - target_ratio) < .05,
            'edge case ratio must be close to target ratio')
        self.assertTrue(
            abs(small_image_standard_ratio - target_ratio) < .1,
            'edge case ratio must be close to target ratio')
        self.assertTrue(
            abs(wide_image_standard_ratio - target_ratio) < .1,
            'edge case ratio must be close to target ratio')
        self.assertTrue(
            abs(high_image_standard_ratio - target_ratio) < .1,
            'edge case ratio must be close to target ratio')
        self.assertTrue(
            abs(exact_image_standard_ratio - target_ratio) <= .0,
            'edge case ratio must be close to target ratio')

    def test_standardize_resolution(self):

        target_ratio = 114.0 / 152.0

        big_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path,
                                  'bigSquare.jpg')))
        small_image = np.array(
            load_img(
                os.path.join(self.edge_case_photos_path, 'smallSquare.jpg')))
        wide_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'wide.jpg')))
        high_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'high.jpg')))
        exact_image = np.array(
            load_img(os.path.join(self.edge_case_photos_path, 'exact.jpg')))

        big_standard_resolution_image = self.dataImporter.standardize_resolution(
            self.dataImporter.standardize_image_ratio(big_image, target_ratio),
            [114, 152])
        small_standard_resolution_image = self.dataImporter.standardize_resolution(
            self.dataImporter.standardize_image_ratio(small_image,
                                                      target_ratio),
            [114, 152])
        wide_standard_resolution_image = self.dataImporter.standardize_resolution(
            self.dataImporter.standardize_image_ratio(wide_image,
                                                      target_ratio),
            [114, 152])
        high_standard_resolution_image = self.dataImporter.standardize_resolution(
            self.dataImporter.standardize_image_ratio(high_image,
                                                      target_ratio),
            [114, 152])
        exact_standard_resolution_image = self.dataImporter.standardize_resolution(
            self.dataImporter.standardize_image_ratio(exact_image,
                                                      target_ratio),
            [114, 152])

        self.assertTrue(
            big_standard_resolution_image.shape == (1, 114, 152, 3),
            'resolution must be changed correctly')
        self.assertTrue(
            small_standard_resolution_image.shape == (1, 114, 152, 3),
            'resolution must be changed correctly')
        self.assertTrue(
            wide_standard_resolution_image.shape == (1, 114, 152, 3),
            'resolution must be changed correctly')
        self.assertTrue(
            high_standard_resolution_image.shape == (1, 114, 152, 3),
            'resolution must be changed correctly')
        self.assertTrue(
            exact_standard_resolution_image.shape == (1, 114, 152, 3),
            'resolution must be changed correctly')

    def test_normalize_data(self):
        normalized_data, mean, std = self.dataImporter.normalize_data(
            np.random.normal(3, 7, [1000, 100, 200, 3]))

        normalized_data_mean = np.mean(normalized_data)
        normalized_data_std = np.std(normalized_data)

        self.assertTrue(
            np.abs(normalized_data_mean) < .001, 'mean must be normalized')
        self.assertTrue(
            np.abs(normalized_data_std - 1.0) < .001, 'std must be normalized')
        self.assertTrue(np.abs(mean - 3.0) < .001, 'mean must be correct')
        self.assertTrue(np.abs(std - 7) < .01, 'std must be correct')

    def test_normalize_test_images(self):
        normalized_data, mean, std = self.dataImporter.normalize_data(
            np.random.normal(3, 7, [1000, 100, 200, 3]))

        normalized_test_images = self.dataImporter.normalize_test_images(
            np.random.normal(3, 7, [500, 100, 200, 3]), mean, std)

        normalized_test_data_mean = np.mean(normalized_test_images)
        normalized_test_data_std = np.std(normalized_test_images)

        self.assertTrue(
            np.abs(normalized_test_data_mean) < .01, 'mean must be normalized')
        self.assertTrue(
            np.abs(normalized_test_data_std - 1.0) < .01,
            'std must be normalized')

    # util functions

    def test_get_image_file_paths(self):
        series = self.reduced_test_photos_description.label_crow_score_str

        file_paths = self.dataImporter.get_image_file_paths(series, 'x')

        self.assertTrue(
            file_paths == ['x/A', 'x/B', 'x/C', 'x/D', 'x/D', 'x/E'])

        file_paths = self.dataImporter.get_image_file_paths(series, 'y/')

        self.assertTrue(
            file_paths == ['y/A', 'y/B', 'y/C', 'y/D', 'y/D', 'y/E'])
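
The class follows standard unittest conventions, so the usual script guard applies (a generic addition, not taken from the original file):

if __name__ == '__main__':
    unittest.main()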
Example #11
class test_SimpleLearning(unittest.TestCase):
    def setUp(self):
        self.unstandardized_photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'simple_learning', 'unstandardized_photos')
        self.photos_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'simple_learning', 'photos')
        self.test_photos_csv_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'simple_learning', 'test_photos.csv')

        self.dataImporter = DataImporter()

        self.dataImporter.convert_to_standard_resolution(
            self.unstandardized_photos_path, self.photos_path, [152, 114], {
                "multi_core_count": 2,
                "timeout_secs": 100,
                "chunk_size": 2
            })

    def tearDown(self):
        pass

    def test_simple_binary_classification(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = SimpleLearning.simple_binary_classification(data_set,
                                                              epochs=1,
                                                              batch_size=2)

        self.assertTrue(type(results["model"]) == Model)

        self.assertTrue(
            np.isfinite(
                results["stats"]["training"]["history"]["val_acc"][-1]))
        self.assertTrue(len(results["stats"]["validation"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["validation"]["correct"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["correct"]) == 4)
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["validation"]["predicted"])))
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["test"]["predicted"])))

        # could fail due to chance; just rerun
        self.assertTrue(
            np.max(results["stats"]["validation"]["predicted"]) == 1)

        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]["loss"]))
        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]["acc"]))
        self.assertTrue(
            np.isfinite(results["stats"]["test"]["metrics"]["loss"]))
        self.assertTrue(np.isfinite(
            results["stats"]["test"]["metrics"]["acc"]))
        self.assertTrue(
            results["stats"]["meta"]["labels"] == ["dirty", "clean"])

    def test_simple_categorical_classification(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = SimpleLearning.simple_categorical_classification(
            data_set, epochs=1, batch_size=2)

        self.assertTrue(type(results["model"]) == Model)

        self.assertTrue(
            np.isfinite(
                results["stats"]["training"]["history"]["val_acc"][-1]))
        self.assertTrue(len(results["stats"]["validation"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["validation"]["correct"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["correct"]) == 4)
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["validation"]["predicted"])))
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["test"]["predicted"])))

        # could fail due to chance; just rerun
        self.assertTrue(
            np.max(results["stats"]["validation"]["predicted"]) > 1)

        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]["loss"]))
        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]["acc"]))
        self.assertTrue(
            np.isfinite(results["stats"]["test"]["metrics"]["loss"]))
        self.assertTrue(np.isfinite(
            results["stats"]["test"]["metrics"]["acc"]))

        self.assertTrue(
            results["stats"]["meta"]["labels"] == ["w", "x", "y", "z"])

    def test_simple_crow_score_regression(self):

        data_set = self.dataImporter.import_all_data(
            self.test_photos_csv_file_path, self.photos_path)

        results = SimpleLearning.simple_crow_score_regression(data_set,
                                                              epochs=1,
                                                              batch_size=2)

        self.assertTrue(type(results["model"]) == Model)

        self.assertTrue(
            np.isfinite(results["stats"]["training"]["history"]
                        ["val_mean_squared_error"][-1]))
        self.assertTrue(len(results["stats"]["validation"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["predicted"]) == 4)
        self.assertTrue(len(results["stats"]["validation"]["correct"]) == 4)
        self.assertTrue(len(results["stats"]["test"]["correct"]) == 4)
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["validation"]["predicted"])))
        self.assertTrue(
            np.all(np.isfinite(results["stats"]["test"]["predicted"])))

        self.assertTrue(
            np.max(results["stats"]["validation"]["predicted"]) > 1)
        self.assertTrue(not np.all(
            np.round(results["stats"]["validation"]["predicted"]) ==
            results["stats"]["validation"]["predicted"]))

        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]["loss"]))
        self.assertTrue(
            np.isfinite(results["stats"]["validation"]["metrics"]
                        ["mean_squared_error"]))
        self.assertTrue(
            np.isfinite(results["stats"]["test"]["metrics"]["loss"]))
        self.assertTrue(
            np.isfinite(
                results["stats"]["test"]["metrics"]["mean_squared_error"]))

        self.assertTrue(results["stats"]["meta"]["labels"] is None)
Example #12
from Configuration import Configuration
from DataImporter import DataImporter
import time


config = Configuration.load()

description = config.parseArguments()

config = config.config["data_import"]

di = DataImporter()
start = time.time()
di.convert_to_standard_resolution(
    config["unstandardized_photos"],
    config["standardized_photos"],
    config["output_image_dimensions"],
    config["loading_config"]
)
after_convert_t = time.time()
print('standardizing took', after_convert_t - start, 'seconds')
# DataImporter.load_from_cache(
#     config["cache_file_path"],
#     config["data_description_file_path"],
#     config["standardized_photos"]
# )
after_load_t = time.time()
# near zero while the cache load above remains commented out
print('loading and normalizing took', after_load_t - after_convert_t, 'seconds')