def test_pipeline_optimize_works_fine(self):
    pipeline = Pipeline(
        feature_selection_algorithm=SelectKBest(),
        feature_transform_algorithm=Normalizer(),
        classifier=RandomForest())
    data_reader = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes.csv',
        has_header=True,
        contains_classes=True)

    self.assertIsInstance(pipeline.get_classifier(), RandomForest)
    self.assertIsInstance(pipeline.get_feature_selection_algorithm(), SelectKBest)
    self.assertIsInstance(pipeline.get_feature_transform_algorithm(), Normalizer)

    accuracy = pipeline.optimize(
        data_reader.get_x(), data_reader.get_y(), 20, 40,
        'ParticleSwarmAlgorithm', 'Accuracy')
    if accuracy != float('inf'):
        self.assertGreaterEqual(accuracy, -1.0)
        self.assertLessEqual(accuracy, 0.0)

    self.assertIsInstance(pipeline.get_classifier(), RandomForest)
    self.assertIsInstance(pipeline.get_feature_selection_algorithm(), SelectKBest)
    self.assertIsInstance(pipeline.get_feature_transform_algorithm(), Normalizer)
def setUp(self):
    self.__data = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes.csv',
        has_header=True,
        contains_classes=True)
    self.__x_train, self.__x_test, self.__y_train, self.__y_test = train_test_split(
        self.__data.get_x(), self.__data.get_y(), test_size=0.2)
def test_pipeline_optimizer_missing_values_categorical_attributes_run_works_fine(self):
    data_reader = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes_cat_miss.csv',
        has_header=True,
        contains_classes=True)
    # pass the data reader created above (the original referenced a
    # non-existent self.__data_reader attribute)
    ppo = PipelineOptimizer(
        data=data_reader,
        feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
        feature_transform_algorithms=['Normalizer', 'StandardScaler'],
        classifiers=['AdaBoost', 'Bagging'],
        categorical_features_encoder='OneHotEncoder',
        imputer='SimpleImputer',
        log=False)
    pipeline = ppo.run('Accuracy', 10, 10, 20, 20, 'ParticleSwarmAlgorithm')

    self.assertIsInstance(pipeline, Pipeline)
    self.assertIsInstance(pipeline.get_classifier(), (AdaBoost, Bagging))
    self.assertIsInstance(
        pipeline.get_feature_selection_algorithm(), (SelectKBest, SelectPercentile))
    self.assertTrue(
        pipeline.get_feature_transform_algorithm() is None or isinstance(
            pipeline.get_feature_transform_algorithm(), (Normalizer, StandardScaler)))
def analyze_data(
    self,
    src: str,
    fitness_name: str,
    population_size: int,
    number_of_evaluations: int,
    optimization_algorithm: str,
    classifiers: Iterable,
    feature_selection_algorithms: Iterable = None,
    feature_transform_algorithms: Iterable = None,
    imputer: str = None,
) -> Pipeline:
    """Run the AutoML process using a NiaAML PipelineOptimizer class instance.

    Args:
        src (str): path to a CSV file
        fitness_name (str): name of the fitness class to use as a function
        population_size (int): number of individuals in the optimization process
        number_of_evaluations (int): maximum number of evaluations
        optimization_algorithm (str): name of the optimization algorithm to use
        classifiers (Iterable[str]): names of possible classifiers
        feature_selection_algorithms (Optional[Iterable[str]]): names of possible feature selection algorithms
        feature_transform_algorithms (Optional[Iterable[str]]): names of possible feature transform algorithms
        imputer (Optional[str]): name of the imputer used for features that contain missing values

    Returns:
        Pipeline: instance of a Pipeline object from the NiaAML framework

    Note:
        See NiaAML's documentation for more details on possible input
        parameters' values and further usage of the returned Pipeline object.
    """
    data = CSVDataReader(src=src, contains_classes=True, has_header=True)
    pipeline_optimizer = PipelineOptimizer(
        data=data,
        classifiers=classifiers,
        feature_selection_algorithms=feature_selection_algorithms,
        feature_transform_algorithms=feature_transform_algorithms,
        imputer=imputer,
    )
    pipeline = pipeline_optimizer.run_v1(
        fitness_name, population_size, number_of_evaluations, optimization_algorithm
    )
    return pipeline
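# A minimal usage sketch for analyze_data, kept as comments because the
# enclosing service class is not shown in the source; the instance name and
# all argument values below are illustrative, not part of the original:
#
# service = ...  # an instance of the class that defines analyze_data
# pipeline = service.analyze_data(
#     src='dataset.csv',
#     fitness_name='Accuracy',
#     population_size=10,
#     number_of_evaluations=20,
#     optimization_algorithm='ParticleSwarmAlgorithm',
#     classifiers=['AdaBoost', 'Bagging'],
#     feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
#     feature_transform_algorithms=['Normalizer'],
#     imputer='SimpleImputer',
# )
# print(pipeline.to_string())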
def run(self):
    dataReader = CSVDataReader(
        src=self.__data.csvSrc, has_header=self.__data.csvHasHeader)
    # the constructor expects the plural feature_transform_algorithms
    # (the original passed the singular form)
    optimizer = PipelineOptimizer(
        data=dataReader,
        feature_selection_algorithms=self.__data.fsas,
        feature_transform_algorithms=self.__data.ftas,
        classifiers=self.__data.classifiers,
        categorical_features_encoder=self.__data.encoder,
        imputer=self.__data.imputer
    )
    # swap in a logger that forwards progress messages to the GUI via the
    # Qt signal, reaching past name mangling into the optimizer's private field
    optimizer._PipelineOptimizer__logger = HackyLogger(self.progress.emit)

    if self.__data.isOptimization is True:
        pipeline = optimizer.run(
            self.__data.fitnessFunctionName, self.__data.popSize,
            self.__data.popSizeInner, self.__data.numEvals,
            self.__data.numEvalsInner, self.__data.optAlgName,
            self.__data.optAlgInnerName)
    else:
        pipeline = optimizer.run_v1(
            self.__data.fitnessFunctionName, self.__data.popSize,
            self.__data.numEvals, self.__data.optAlgName)

    pipeline.export(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
    pipeline.export_text(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
    self.optimized.emit(pipeline.to_string())
def test_pipeline_run_works_fine(self):
    pipeline = Pipeline(
        feature_selection_algorithm=SelectKBest(),
        feature_transform_algorithm=Normalizer(),
        classifier=RandomForest())
    data_reader = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes.csv',
        has_header=True,
        contains_classes=True)
    pipeline.optimize(
        data_reader.get_x(), data_reader.get_y(), 20, 40,
        'ParticleSwarmAlgorithm', 'Accuracy')
    predicted = pipeline.run(
        pandas.DataFrame(
            numpy.random.uniform(
                low=0.0, high=15.0,
                size=(30, data_reader.get_x().shape[1]))))

    self.assertEqual(predicted.shape, (30, ))

    s1 = set(data_reader.get_y())
    s2 = set(predicted)
    self.assertTrue(s2.issubset(s1))
    self.assertTrue(0 < len(s2) <= 2)
class FeatureSelectionTestCase(TestCase):
    def setUp(self):
        self.__data = CSVDataReader(
            src=os.path.dirname(os.path.abspath(__file__)) +
            '/tests_files/dataset_header_classes.csv',
            has_header=True,
            contains_classes=True)

    def test_PSO_works_fine(self):
        algo = fs.ParticleSwarmOptimization()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_select_k_best_works_fine(self):
        algo = fs.SelectKBest()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_select_percentile_works_fine(self):
        algo = fs.SelectPercentile()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_bat_algorithm_works_fine(self):
        algo = fs.BatAlgorithm()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_de_works_fine(self):
        algo = fs.DifferentialEvolution()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_gwo_works_fine(self):
        algo = fs.GreyWolfOptimizer()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_jdefsth_works_fine(self):
        algo = fs.jDEFSTH()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))

    def test_vt_works_fine(self):
        algo = fs.VarianceThreshold()
        selected_features_mask = algo.select_features(
            self.__data.get_x(), self.__data.get_y())
        self.assertEqual(self.__data.get_x().shape[1], len(selected_features_mask))
class ClassifierTestCase(TestCase):
    def setUp(self):
        self.__data = CSVDataReader(
            src=os.path.dirname(os.path.abspath(__file__)) +
            '/tests_files/dataset_header_classes.csv',
            has_header=True,
            contains_classes=True)
        self.__x_train, self.__x_test, self.__y_train, self.__y_test = train_test_split(
            self.__data.get_x(), self.__data.get_y(), test_size=0.2)

    def test_adaboost_works_fine(self):
        algo = c.AdaBoost()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_bagging_works_fine(self):
        algo = c.Bagging()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_ert_works_fine(self):
        algo = c.ExtremelyRandomizedTrees()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_lsvc_works_fine(self):
        algo = c.LinearSVC()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_mlp_works_fine(self):
        algo = c.MultiLayerPerceptron()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_rf_works_fine(self):
        algo = c.RandomForest()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_dt_works_fine(self):
        algo = c.DecisionTree()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_kn_works_fine(self):
        algo = c.KNeighbors()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_gp_works_fine(self):
        algo = c.GaussianProcess()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_gnb_works_fine(self):
        algo = c.GaussianNB()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)

    def test_qda_works_fine(self):
        algo = c.QuadraticDiscriminantAnalysis()
        algo.fit(self.__x_train, self.__y_train)
        predictions = algo.predict(self.__x_test)
        self.assertEqual(predictions.shape, self.__y_test.shape)
from niaaml.fitness import Precision
from niaaml.data import CSVDataReader
import os
import numpy

"""
This example presents how to use an implemented fitness function and its
method individually. In this case, we use Precision for demonstration, but
you can use any of the implemented fitness functions in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# let's say the following array contains predictions after the classification process
predictions = numpy.random.choice(['Class 1', 'Class 2'], size=data_reader.get_y().shape)

# instantiate a fitness function (Precision in this case)
fitness_func = Precision()

# calculate the fitness value
precision = fitness_func.get_fitness(predictions, data_reader.get_y())

# precision will probably be low due to the random dummy predictions
print(precision)
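# A quick sanity check (a sketch reusing the objects above, under the
# assumption implied by the comment above that higher scores are better):
# feeding the true labels back in as predictions should give the best
# attainable precision value
perfect_precision = fitness_func.get_fitness(data_reader.get_y(), data_reader.get_y())
print(perfect_precision)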
from niaaml.preprocessing.feature_transform import Normalizer
import os
from niaaml.data import CSVDataReader

"""
This example presents how to use an implemented feature transform algorithm
and its methods individually. In this case, we use Normalizer for
demonstration, but you can use any of the implemented feature transform
algorithms in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# instantiate Normalizer
ft = Normalizer()

# set parameters of the Normalizer
ft.set_parameters(norm='l2')

# fit the algorithm to the input data
ft.fit(data_reader.get_x())

# transform features
transformed_features = ft.transform(data_reader.get_x())

# print the feature transform algorithm in a user-friendly form
print(ft.to_string())
from niaaml.preprocessing.feature_selection import SelectKBest
import os
from niaaml.data import CSVDataReader
from sklearn.feature_selection import chi2

"""
This example presents how to use an implemented feature selection algorithm
and its methods individually. In this case, we use SelectKBest for
demonstration, but you can use any of the implemented feature selection
algorithms in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# instantiate the SelectKBest feature selection algorithm
fs = SelectKBest()

# set parameters of the object
fs.set_parameters(k=4, score_func=chi2)

# select the best features according to the SelectKBest algorithm
# (returns a boolean mask of the selected features - True if selected, False if not)
features_mask = fs.select_features(data_reader.get_x(), data_reader.get_y())

# print the feature selection algorithm in a user-friendly form
print(fs.to_string())
import os
from niaaml.data import CSVDataReader

"""
This example presents how to instantiate CSVDataReader and use its methods.
You can use it to hold data in a single variable, or as an input to an
instance of the PipelineOptimizer class.
"""

# CSVDataReader takes a path to a csv file as input, then reads and parses it into the x and y arrays
# the has_header and contains_classes arguments need to be set according to the input csv file's structure
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# get x and y arrays and print them
print(data_reader.get_x())
print(data_reader.get_y())
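# A self-contained sketch of the expected file layout (the rows below are made
# up for illustration): with contains_classes=True and no header, the class
# labels are expected in the last column, as in the bundled example datasets,
# and the remaining columns become x
import csv
import tempfile

rows = [
    [0.1, 1.2, 'Class 1'],
    [0.4, 0.8, 'Class 2'],
    [0.9, 1.1, 'Class 1'],
]
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False, newline='') as f:
    csv.writer(f).writerows(rows)
    tmp_src = f.name

tmp_reader = CSVDataReader(src=tmp_src, has_header=False, contains_classes=True)
print(tmp_reader.get_x())  # the two numeric feature columns
print(tmp_reader.get_y())  # the class column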
def run(self):
    dataReader = CSVDataReader(
        src=self.__data.csvSrc,
        contains_classes=False,
        has_header=self.__data.csvHasHeader)
    pipeline = Pipeline.load(self.__data.pipelineSrc)
    predictions = pipeline.run(dataReader.get_x())
    self.ran.emit(str(predictions))
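# The same load-and-predict flow as a plain script, outside the GUI worker,
# kept as comments because the file paths are placeholders; Pipeline.load
# restores a pipeline previously persisted with pipeline.export:
#
# from niaaml import Pipeline
# from niaaml.data import CSVDataReader
#
# data_reader = CSVDataReader(src='unlabeled.csv', has_header=True, contains_classes=False)
# pipeline = Pipeline.load('exported_pipeline.ppln')
# predictions = pipeline.run(data_reader.get_x())
# print(predictions)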
from niaaml import Pipeline
from niaaml.classifiers import MultiLayerPerceptron
from niaaml.preprocessing.feature_selection import VarianceThreshold
from niaaml.preprocessing.feature_transform import Normalizer
from niaaml.data import CSVDataReader
from niaaml.preprocessing.encoding import encode_categorical_features
import os
import numpy
import pandas

"""
This example presents how to use the Pipeline class individually. You may use
this if you want to test out a specific classification pipeline. We use a
dataset that contains categorical and numerical features.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical.csv',
    has_header=False,
    contains_classes=True)

# we use the utility method encode_categorical_features to get encoders for the categorical
# features, but you may instantiate and fit feature encoders separately and pass them as an array
# (as long as they are implemented as this framework suggests)
# there should be as many encoders as there are categorical features
# this example uses One-Hot Encoding
_, encoders = encode_categorical_features(data_reader.get_x(), 'OneHotEncoder')

# instantiate a Pipeline object
pipeline = Pipeline(
    feature_selection_algorithm=VarianceThreshold(),
    feature_transform_algorithm=Normalizer(),
    classifier=MultiLayerPerceptron(),
    categorical_features_encoders=encoders)

# run pipeline optimization process (returns a fitness value, but sets the best parameters for the
# classifier, feature selection algorithm and feature transform algorithm during the process)
pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 10, 50, 'ParticleSwarmAlgorithm', 'Accuracy')
import os
from niaaml import PipelineOptimizer, Pipeline
from niaaml.data import CSVDataReader

"""
In this example, we show how to use the PipelineOptimizer class. This example
uses an instance of CSVDataReader. The instantiated PipelineOptimizer will try
to assemble the best pipeline from the components specified in its
constructor. We use a dataset with one categorical feature and missing values
to demonstrate the use of a PipelineOptimizer instance with automatic feature
encoding and imputation.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv',
    has_header=False,
    contains_classes=True)

# instantiate PipelineOptimizer that chooses among the specified classifiers,
# feature selection algorithms and feature transform algorithms
# OneHotEncoder is used for encoding categorical features in this example
# SimpleImputer is used for imputing missing values in this example
# log is True by default; log_verbose=True means more information is logged,
# and log_output_file is the destination of the log file
# if log_output_file is not provided, no file is created
# if log is False, logging is turned off
pipeline_optimizer = PipelineOptimizer(
    data=data_reader,
    classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'],
    feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'],
    feature_transform_algorithms=['Normalizer', 'StandardScaler'],
    categorical_features_encoder='OneHotEncoder',
    imputer='SimpleImputer',
    log=True,
    log_verbose=True,
    log_output_file='output.log'
)
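# with the optimizer configured, the search would typically be started like
# this; the argument values are illustrative and mirror the tests above
# (fitness name, outer/inner population sizes, outer/inner evaluation counts,
# optimization algorithm)
pipeline = pipeline_optimizer.run('Accuracy', 10, 10, 20, 20, 'ParticleSwarmAlgorithm')

# print the best pipeline found
print(pipeline.to_string())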
from niaaml.classifiers import AdaBoost
import os
from niaaml.data import CSVDataReader
import numpy

"""
In this example, we show how to individually use an implemented classifier and
its methods. In this case, we use AdaBoost for demonstration, but you can use
any of the implemented classifiers in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# instantiate AdaBoost classifier
classifier = AdaBoost()

# set parameters of the classifier
classifier.set_parameters(n_estimators=50, algorithm='SAMME')

# fit the classifier to the data
classifier.fit(data_reader.get_x(), data_reader.get_y())

# predict classes of the dummy input
predicted = classifier.predict(
    numpy.random.uniform(low=0.0, high=15.0, size=(30, data_reader.get_x().shape[1])))

# print the classifier in a user-friendly form
print(classifier.to_string())
from niaaml import Pipeline
from niaaml.classifiers import MultiLayerPerceptron
from niaaml.preprocessing.feature_selection import VarianceThreshold
from niaaml.preprocessing.feature_transform import Normalizer
from niaaml.data import CSVDataReader
from niaaml.preprocessing.encoding import encode_categorical_features
from niaaml.preprocessing.imputation import impute_features
import os
import numpy
import pandas

"""
This example presents how to use the Pipeline class individually. You may use
this if you want to test out a specific classification pipeline. We use a
dataset that contains categorical and numerical features with missing values.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv',
    has_header=False,
    contains_classes=True)

features = data_reader.get_x()

# we use the utility method impute_features to get imputers for the features with missing values,
# but you may instantiate and fit imputers separately and pass them as a dictionary (as long as
# they are implemented as this framework suggests), with keys as column names or indices
# (if there is no header in the csv)
# there should be as many imputers as there are features with missing values
# this example uses the Simple Imputer
features, imputers = impute_features(features, 'SimpleImputer')

# exactly the same goes for encoders
_, encoders = encode_categorical_features(features, 'OneHotEncoder')

# instantiate a Pipeline object (the classifier choice here is illustrative;
# any implemented classifier works)
pipeline = Pipeline(
    feature_selection_algorithm=VarianceThreshold(),
    feature_transform_algorithm=Normalizer(),
    classifier=MultiLayerPerceptron(),
    categorical_features_encoders=encoders,
    imputers=imputers)
from niaaml import Pipeline
from niaaml.classifiers import MultiLayerPerceptron
from niaaml.preprocessing.feature_selection import VarianceThreshold
from niaaml.preprocessing.feature_transform import Normalizer
from niaaml.data import CSVDataReader
import os
import numpy
import pandas

"""
This example presents how to use the Pipeline class individually. You may use
this if you want to test out a specific classification pipeline.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv',
    has_header=False,
    contains_classes=True)

# instantiate a Pipeline object
pipeline = Pipeline(
    feature_selection_algorithm=VarianceThreshold(),
    feature_transform_algorithm=Normalizer(),
    classifier=MultiLayerPerceptron())

# run pipeline optimization process (returns a fitness value, but sets the best parameters for the
# classifier, feature selection algorithm and feature transform algorithm during the process)
pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 10, 50, 'ParticleSwarmAlgorithm', 'Accuracy')

# run the pipeline using dummy data
# you could run the pipeline before the optimization process, but the predictions would be
# meaningless as nothing in the pipeline has been fitted to the given dataset
predicted = pipeline.run(
    pandas.DataFrame(
        numpy.random.uniform(low=0.0, high=15.0, size=(30, data_reader.get_x().shape[1]))))

# print the predictions
print(predicted)
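# once optimized, the pipeline can be persisted and restored with the same
# export/load calls the GUI worker uses above (the file name is illustrative)
pipeline.export('exported_pipeline.ppln')
loaded_pipeline = Pipeline.load('exported_pipeline.ppln')
print(loaded_pipeline.to_string())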
class FeatureTransformTestCase(TestCase):
    def setUp(self):
        self.__data = CSVDataReader(
            src=os.path.dirname(os.path.abspath(__file__)) +
            '/tests_files/dataset_header_classes.csv',
            has_header=True,
            contains_classes=True)

    def test_mas_works_fine(self):
        algo = ft.MaxAbsScaler()
        algo.fit(self.__data.get_x())
        transformed = algo.transform(self.__data.get_x())
        self.assertEqual(transformed.shape, self.__data.get_x().shape)

    def test_norm_works_fine(self):
        algo = ft.Normalizer()
        algo.fit(self.__data.get_x())
        transformed = algo.transform(self.__data.get_x())
        self.assertEqual(transformed.shape, self.__data.get_x().shape)

    def test_qt_works_fine(self):
        algo = ft.QuantileTransformer()
        algo.fit(self.__data.get_x())
        transformed = algo.transform(self.__data.get_x())
        self.assertEqual(transformed.shape, self.__data.get_x().shape)

    def test_rs_works_fine(self):
        algo = ft.RobustScaler()
        algo.fit(self.__data.get_x())
        transformed = algo.transform(self.__data.get_x())
        self.assertEqual(transformed.shape, self.__data.get_x().shape)

    def test_ss_works_fine(self):
        algo = ft.StandardScaler()
        algo.fit(self.__data.get_x())
        transformed = algo.transform(self.__data.get_x())
        self.assertEqual(transformed.shape, self.__data.get_x().shape)
def test_header_classes_works_fine(self):
    data_reader = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes.csv',
        has_header=True,
        contains_classes=True)
    x = data_reader.get_x()
    y = data_reader.get_y()
    self.assertEqual(x.shape, (100, 6))
    self.assertEqual(y.shape, (100, ))
def setUp(self):
    self.__data = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes.csv',
        has_header=True,
        contains_classes=True)