Example #1
 def test_cv_score_is_more_than_zero_with_APROPAGATION_KMEANS_MINIKMEANS_MEANSHIFT_mutual_info_score_5(
         self):
     # path to iris.csv file in project
     path = ".\\..\\datasets\\iris.csv"
     # get df with loader creator
     csv_type = self._loader_creator.create_loader(path, "CSV")
     df = csv_type.get_file_transformed()
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a CVScore object with its path and data type
     cv_score = CVScore()
     # create AffinityPropagation, KMeans, MiniBatchKMeans and MeanShift estimators
     model_1 = self._estimator_creator.create_estimator(
         "AffinityPropagation")
     model_2 = self._estimator_creator.create_estimator("KMeans")
     model_3 = self._estimator_creator.create_estimator("MiniBatchKMeans")
     model_4 = self._estimator_creator.create_estimator("MeanShift")
     estimators = [
         model_1.set_params(random_state=0), model_2, model_3, model_4
     ]
     # get the score of each clustering estimator with mutual_info_score and 5 folds
     bol_results = []
     for clf in estimators:
         score = cv_score.get_score(x, y, clf, "mutual_info_score", 5)
         print(clf.__class__.__name__, "score is:", score)
         is_greater_than_zero: bool = score > 0
         bol_results.append(is_greater_than_zero)
     print(bol_results)
Example #2
 def test_cv_score_is_more_than_zero_with_LSVC_SVC_KNN_GNB_accuracy_5(self):
     # path to diabetes.csv file in project
     path = ".\\..\\datasets\\diabetes.csv"
     # get df with loader creator
     csv_type = self._loader_creator.create_loader(path, "CSV")
     df = csv_type.get_file_transformed()
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a CVScore object with its path and data type
     cv_score = CVScore()
     # create simple SVC, KNeighborsClassifier, GaussianNB and LinearSVC estimators
     model_1 = self._estimator_creator.create_estimator("SVC")
     model_2 = self._estimator_creator.create_estimator(
         "KNeighborsClassifier")
     model_3 = self._estimator_creator.create_estimator("GaussianNB")
     model_4 = self._estimator_creator.create_estimator("LinearSVC")
     estimators = [
         model_1, model_2, model_3,
         model_4.set_params(dual=False)
     ]
     # get the score of each estimator with accuracy and 5 folds
     bol_results = []
     for clf in estimators:
         score = cv_score.get_score(x, y, clf, "accuracy", 5)
         print(clf.__class__.__name__, "score is:", score)
         is_greater_than_zero: bool = score > 0
         bol_results.append(is_greater_than_zero)
     # all() returns True only if every value in the iterable is truthy.
     print(bol_results)
     answer = all(bol_results)
     # all of the results should be True
     self.assertTrue(answer)
Example #3
 def test_cv_score_is_more_than_zero_with_LSVR_SVR_LASSO_SGD_explained_variance_5(
         self):
     # path to winequality-white.csv file in project
     path = ".\\..\\datasets\\winequality-white.csv"
     # get df with loader creator
     scsv_type = self._loader_creator.create_loader(path, "SCSV")
     df = scsv_type.get_file_transformed()
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a CVScore object with its path and data type
     cv_score = CVScore()
     # create simple LinearSVR, SVR, Lasso and SGDClassifier estimators
     model_1 = self._estimator_creator.create_estimator("LinearSVR")
     model_2 = self._estimator_creator.create_estimator("SVR")
     model_3 = self._estimator_creator.create_estimator("Lasso")
     model_4 = self._estimator_creator.create_estimator("SGDClassifier")
     estimators = [model_1, model_2, model_3, model_4]
     # get the score of each estimator with explained_variance and 5 folds
     bol_results = []
     for clf in estimators:
         score = cv_score.get_score(x, y, clf, "explained_variance", 5)
         print(clf.__class__.__name__, "score is:", score)
         is_greater_than_zero: bool = score > 0
         bol_results.append(is_greater_than_zero)
     print(bol_results)
     # there is at least one True element, which means one of the scores is greater than 0
     self.assertTrue(any(bol_results))
Example #4
 def test_split_into_x_and_y_is_not_a_valid_dataframe(self):
     # dummy dictionary
     temp_dict = {'x': [i for i in range(200)]}
     # transform dictionary to dataframe
     df = pd.DataFrame.from_dict(temp_dict)
     # this should raise a TypeError because the dataframe doesn't meet the column requirements
     with self.assertRaises(TypeError):
         splitter = SplitterReturner()
         _, _ = splitter.split_x_y_from_df(df)
Example #5
 def train_model(self, df: DataFrame, size: float = 0.0) -> None:
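     # split the dataframe into features (x) and target (y); fit on all of the data
     # when size is 0.0, otherwise fit only on a train split of the given size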
     x, y = SplitterReturner.split_x_y_from_df(df)
     if size == 0.0:
         self._auto_ml.fit_model(x, y)
     elif 0.0 < size < 1.0:
         x_train, _, y_train, _ = SplitterReturner.train_and_test_split(
             x, y, size)
         self._auto_ml.fit_model(x_train, y_train)
     else:
         raise ValueError("Size is neither 0.0 nor 0.0 < size < 1.0")
Example #6
 def test_single_split_returns_a_tuple(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     csv_type = self._loader_creator.create_loader(test_full_path, "CSV")
     df = csv_type.get_file_transformed()
     # use of SplitterReturner with a NormalSplitter implementation
     splitter = SplitterReturner()
     # split dataframe into x and y
     data = splitter.split_x_y_from_df(df)
     result = DataEnsurer.validate_py_data(data, tuple)
     self.assertTrue(result)
Example #7
 def test_train_test_split_size_zero_is_wrong(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     csv_type = self._loader_creator.create_loader(test_full_path, "CSV")
     df = csv_type.get_file_transformed()
     # use of SplitterReturner with a NormalSplitter implementation
     with self.assertRaises(ValueError):
         splitter = SplitterReturner()
         # split dataframe into x and y, then use train_and_test_split
         x, y = splitter.split_x_y_from_df(df)
         _ = splitter.train_and_test_split(
             x, y, 0.0
         )  # a test size of 0.0 is not valid, so this should raise a ValueError
Example #8
 def test_train_test_split_size_less_than_zero_is_wrong(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     csv_type = self._loader_creator.create_loader(test_full_path, "CSV")
     df = csv_type.get_file_transformed()
     # this should raise a ValueError because size = -0.5 is not a valid number
     with self.assertRaises(ValueError):
         # use of SplitterReturner with a NormalSplitter implementation
         splitter = SplitterReturner()
         # split dataframe into x and y, then use train_and_test_split
         x, y = splitter.split_x_y_from_df(df)
         _ = splitter.train_and_test_split(
             x, y, -0.5)  # -0.5 is not a valid value
Example #9
 def test_single_split_columns_match(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     csv_type = self._loader_creator.create_loader(test_full_path, "CSV")
     df = csv_type.get_file_transformed()
     expected_y_len, expected_x_len = df.shape  # shape returns (number of rows, number of columns)
     # x does not include the prediction column, so its expected column count is the original value minus 1
     expected_x_len -= 1
     # use of SplitterReturner with a NormalSplitter implementation
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # do the lengths of x and y match the expected values?
     self.assertEqual(len(x.columns), expected_x_len)
     self.assertEqual(len(y), expected_y_len)
Example #10
 def test_wine_quality_has_fewer_features_with_LASSO_FFS_explained_variance_10(
         self):
     # load winequality-red.csv from disk
     folder_name = "datasets"
     file_name = "winequality-red.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     scsv_type = self._loader_creator.create_loader(test_full_path, "scsv")
     df = scsv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("FFS")
     # create a simple Lasso estimator
     clf = self._estimator_creator.create_estimator("Lasso")
     prm = {
         'alpha': 1.0,
         'random_state': 8,
         'selection': 'cyclic',
         'tol': 0.0001
     }
     clf.set_params(**prm)
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "explained_variance", 10)
     print(new_x.columns.values, f"\n{score}")
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     print("lasso", is_fewer_than_original)
Example #11
 def test_wine_quality_has_fewer_features_with_LSVR_FFS_r2_10(self):
     # load winequality-red.csv from disk
     folder_name = "datasets"
     file_name = "winequality-red.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     scsv_type = self._loader_creator.create_loader(test_full_path, "scsv")
     df = scsv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("FFS")
     # create a simple LSVR estimator
     clf = self._estimator_creator.create_estimator("LinearSVR")
     clf.set_params(max_iter=20000,
                    dual=False,
                    loss="squared_epsilon_insensitive")
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "r2", 10)
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     # this should be True
     self.assertTrue(is_fewer_than_original)
Example #12
 def test_n_folds_validation_and_score_type_raises_ValueError(self):
     # path to diabetes.csv file in project
     path = ".\\..\\datasets\\diabetes.csv"
     # get df with loader creator
     csv_type = self._loader_creator.create_loader(path, "CSV")
     df = csv_type.get_file_transformed()
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a CVScore object with its path and data type
     cv_score = CVScore()
     # create a simple linearSVC estimator
     model = self._estimator_creator.create_estimator("LinearSVC")
     model.set_params(dual=False, random_state=0)
     with self.assertRaises(ValueError):
         # an invalid score type ("roc") with only 2 folds should raise a ValueError
         _ = cv_score.get_score(x, y, model, "roc", 2)
Example #13
 def test_single_split_x_and_y_is_a_dataframe_and_numpy_array(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     csv_type = self._loader_creator.create_loader(test_full_path, "CSV")
     df = csv_type.get_file_transformed()
     # use of SplitterReturner with a NormalSplitter implementation
     splitter = SplitterReturner()
     # split dataframe into x and y
     data = splitter.split_x_y_from_df(df)
     results = [
         isinstance(data[0], pd.DataFrame),
         isinstance(data[-1], np.ndarray)
     ]
     # are all outputs True?
     for r in results:
         self.assertTrue(r)
Example #14
 def test_wine_quality_LASSO_GS(self):
     # path to winequality-white.csv file in project
     path = ".\\..\\datasets\\winequality-white.csv"
     # get df with loader creator
     scsv_type = self._loader_creator.create_loader(path, "SCSV")
     df = scsv_type.get_file_transformed()
     # create a prm variable to store params grid
     initial_prm = GridSearchParametersPossibilities.case("Lasso")
     # create an estimator using EstimatorCreator
     estimator = self._estimator_creator.create_estimator("Lasso")
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a ps variable that stores a grid search object
     ps = self._param_search_creator.create_parameter_selector("GS")
     # get best params from ps.search_parameters
     best_prm, _ = ps.search_parameters(x, y, initial_prm, 10, estimator,
                                        "r2")
     print(best_prm)
Example #15
 def test_molecules_SVC_grid_search(self):
     # path to molecules.csv file in project
     path = ".\\..\\datasets\\molecules.csv"
     # get df with loader creator
     csv_type = self._loader_creator.create_loader(path, "TSV")
     df = csv_type.get_file_transformed()
     df = df.drop(["m_name"], axis=1)
     # split df into x and y
     splitter = SplitterReturner()
     x, y = splitter.split_x_y_from_df(df)
     # create a simple SVC estimator
     model = self._estimator_creator.create_estimator("SVC")
     # create a prm variable that stores the param grid to search
     prm = GridSearchParametersPossibilities.case("SVC")
     # create a ps variable that stores a grid search object
     ps = self._param_search_creator.create_parameter_selector("GS")
     # get best params from ps.search_parameters
     best_prm, score = ps.search_parameters(x, y, prm, 10, model,
                                            "accuracy")
     print(best_prm, score)
Example #16
 def score_model(self, score_type: str, n_folds_validation: int) -> float:
     # get x and y from df
     x, y = SplitterReturner.split_x_y_from_df(self.data_frame)
     self.best_parameters = self.initial_parameters  # they are the same in a simple model
     # set clf params. ** because it accepts key-value one by one, not a big dictionary
     self.estimator.set_params(**self.best_parameters)
     self.best_features = x.columns.values  # get features as numpy data
     # return the cv score
     score = self._cv_score.get_score(x, y, clone(self.estimator),
                                      score_type, n_folds_validation)
     self.estimator.fit(x, y)
     return score
Example #17
 def score_model(self, score_type: str, n_folds_validation: int) -> float:
     # get x and y from df
     x, y = SplitterReturner.split_x_y_from_df(self.data_frame)
     # transform initial params grid into a simple dict which is best_params
     self.best_parameters, score = self.parameter_selector.search_parameters(
         x, y, self.initial_parameters, n_folds_validation,
         clone(self.estimator), score_type)
     self.best_features = x.columns.values  # get features as numpy data
     # set clf params from the previous search. ** because it accepts key-value one by one, not a big dictionary
     self.estimator.set_params(**self.best_parameters)
     self.estimator.fit(x, y)
     return score
Example #18
 def score_model(self, score_type: str, n_folds_validation: int) -> float:
     # get x and y from df. Ravel is set to False so we can save the original y with its column
     x, y = SplitterReturner.split_x_y_from_df(self.data_frame,
                                               ravel_data=False)
     self.best_parameters = self.initial_parameters  # they are the same in a feature-selection-only model
     # set clf params. ** because it accepts key-value one by one, not a big dictionary
     self.estimator.set_params(**self.best_parameters)
     # get best features
     best_features_dataframe, score = self.feature_selector.select_features(
         x, y.values.ravel(), clone(self.estimator), score_type,
         n_folds_validation)
     self.best_features = best_features_dataframe.columns.values  # get features as numpy data
     self.data_frame = pd.concat([best_features_dataframe, y], axis=1)
     self.estimator.fit(best_features_dataframe, y)
     return score
Example #19
 def test_diabetes_lsvc_search_bs(self):
     # path to diabetes.csv file in project
     path = ".\\..\\datasets\\diabetes.csv"
     # get df with loader creator
     csv_type = self._loader_creator.create_loader(path, "CSV")
     df = csv_type.get_file_transformed()
     # split df into x and y
     x, y = SplitterReturner.split_x_y_from_df(df)
     # create a simple linearSVC estimator
     model = self._estimator_creator.create_estimator("LinearSVC")
     # create a prm variable that stores the param grid to search
     prm = BayesianSearchParametersPossibilities.case("LinearSVC")
     # create a ps variable that stores a bayesian search object
     ps = self._param_search_creator.create_parameter_selector("BS")
     # get best params from ps.search_parameters
     best_prm, _ = ps.search_parameters(x, y, prm, 10, model, "accuracy")
     print(best_prm)
Example #20
 def test_wine_quality_with_LSVR_BFS_neg_mean_squared_error_10(self):
     # load winequality-red.csv from disk
     folder_name = "datasets"
     file_name = "winequality-red.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     scsv_type = self._loader_creator.create_loader(test_full_path, "scsv")
     df = scsv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("BFS")
     # create a simple LSVR estimator
     clf = self._estimator_creator.create_estimator("LinearSVR")
     clf.set_params(max_iter=20000,
                    dual=False,
                    loss="squared_epsilon_insensitive")
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "neg_mean_squared_error",
                                       10)
     print(new_x.columns.values, f"\n{score}")
Example #21
 def test_iris_has_fewer_features_with_KMEANS_FFS_mutual_info_score_5(self):
     # load iris.csv from disk
     folder_name = "datasets"
     file_name = "iris.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     csv_type = self._loader_creator.create_loader(test_full_path, "csv")
     df = csv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("FFS")
     # create a simple Kmeans estimator
     clf = self._estimator_creator.create_estimator("KMeans")
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "mutual_info_score", 5)
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     # this should be True
     self.assertTrue(is_fewer_than_original)
Example #22
 def test_diabetes_has_fewer_features_with_SVC_FFS_accuracy_10(self):
     # load diabetes.csv from disk
     folder_name = "datasets"
     file_name = "diabetes.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     csv_type = self._loader_creator.create_loader(test_full_path, "csv")
     df = csv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("FFS")
     # create a simple SVC estimator
     clf = self._estimator_creator.create_estimator("SVC")
     clf.set_params(random_state=0)
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "accuracy", 10)
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     # this should be True
     self.assertTrue(is_fewer_than_original)
Example #23
 def test_wine_quality_has_fewer_features_with_SVR_BFS_explained_variance_5(
         self):
     # load winequality-white.csv from disk
     folder_name = "datasets"
     file_name = "winequality-white.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     scsv_type = self._loader_creator.create_loader(test_full_path, "scsv")
     df = scsv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("BFS")
     # create a simple SVR estimator
     clf = self._estimator_creator.create_estimator("SVR")
     clf.set_params(gamma="auto")
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "explained_variance", 5)
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     # this should be True
     self.assertTrue(is_fewer_than_original)
Example #24
 def test_iris_has_fewer_features_with_MEANSHIFT_BFS_mutual_info_score_10(
         self):
     # load iris.csv from disk
     folder_name = "datasets"
     file_name = "iris.csv"
     test_full_path = ".\\..\\" + folder_name + "\\" + file_name
     # get dataframe using LoaderCreator
     csv_type = self._loader_creator.create_loader(test_full_path, "csv")
     df = csv_type.get_file_transformed()
     # get x and y from SplitterReturner
     x, y = SplitterReturner.split_x_y_from_df(df)
     _, len_original_y = x.shape
     # create a feature selector
     fs = self._feature_selector_creator.create_feature_selector("BFS")
     # create a simple MeanShift estimator
     clf = self._estimator_creator.create_estimator("MeanShift")
     # get new_x with new features
     new_x, score = fs.select_features(x, y, clf, "mutual_info_score", 10)
     print(new_x.columns.values, f"\n{score}")
     _, len_new_y = new_x.shape
     # does it have fewer features?
     is_fewer_than_original: bool = len_new_y < len_original_y
     # for this dataset and estimator, BFS considers all of the features necessary, so just print the result
     print(is_fewer_than_original)