示例#1
0
    def _apply_default_pipeline_settings(pipeline):
        """Install this network type's default components on ``pipeline``.

        The shared ``AutoNetFeatureData`` defaults are applied first. After
        that the following entries are registered on the pipeline nodes:

        * final activation ``'none'`` -- an empty ``nn.Sequential`` container
        * loss module ``'l1_loss'`` -- ``nn.L1Loss``
        * metric ``'mean_distance'``

        Finally the train node's ``default_minimize_value`` is set to ``True``
        and the cross-validation node's ``use_stratified_cv_split_default`` is
        set to ``False``.

        Args:
            pipeline: object indexable by node name; it is modified in place.
        """
        from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
        from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
        from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
        from autoPyTorch.pipeline.nodes.train_node import TrainNode
        from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation

        import torch.nn as nn
        from autoPyTorch.components.metrics.standard_metrics import mean_distance

        # Base-class defaults go in before anything specific to this class.
        AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

        # Register the selectable components, one node at a time.
        pipeline[NetworkSelector.get_name()].add_final_activation(
            'none', nn.Sequential())
        pipeline[LossModuleSelector.get_name()].add_loss_module(
            'l1_loss', nn.L1Loss)
        pipeline[MetricSelector.get_name()].add_metric(
            'mean_distance', mean_distance)

        # Node-level switches.
        pipeline[TrainNode.get_name()].default_minimize_value = True
        pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
    def _apply_default_pipeline_settings(pipeline):
        """Install this network type's default components on ``pipeline``.

        The shared ``AutoNetFeatureData`` defaults are applied first. After
        that the following entries are registered on the pipeline nodes:

        * final activation ``'sigmoid'`` -- an ``nn.Sigmoid`` instance
        * loss modules ``'bce_with_logits'`` and ``'bce_with_logits_weighted'``,
          both backed by ``nn.BCEWithLogitsLoss``; the weighted variant is
          registered together with a ``LossWeightStrategyWeightedBinary``
          instance
        * metric ``'multilabel_accuracy'``

        Finally the train node's ``default_minimize_value`` and the
        cross-validation node's ``use_stratified_cv_split_default`` are both
        set to ``False``.

        Args:
            pipeline: object indexable by node name; it is modified in place.
        """
        from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
        from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
        from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
        from autoPyTorch.pipeline.nodes.train_node import TrainNode
        from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation

        import torch.nn as nn
        from autoPyTorch.components.metrics.standard_metrics import multilabel_accuracy
        from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeightedBinary

        # Base-class defaults go in before anything specific to this class.
        AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

        pipeline[NetworkSelector.get_name()].add_final_activation(
            'sigmoid', nn.Sigmoid())

        # Same loss class twice: once without and once with a weight strategy.
        losses = pipeline[LossModuleSelector.get_name()]
        losses.add_loss_module(
            'bce_with_logits', nn.BCEWithLogitsLoss, None, False)
        losses.add_loss_module(
            'bce_with_logits_weighted', nn.BCEWithLogitsLoss,
            LossWeightStrategyWeightedBinary(), False)

        pipeline[MetricSelector.get_name()].add_metric(
            'multilabel_accuracy', multilabel_accuracy)

        # Node-level switches.
        pipeline[TrainNode.get_name()].default_minimize_value = False
        pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
示例#3
0
    def fit(self, pipeline_config, hyperparameter_config, X, Y, train_indices, valid_indices):
        """Resample the training rows with the configured over/under sampling.

        Args:
            pipeline_config: pipeline settings. ``"random_seed"`` is read here
                and the whole object is forwarded to
                ``CrossValidation.get_validation_set_split_indices``.
            hyperparameter_config: hyperparameters; wrapped in a
                ``ConfigWrapper`` keyed by this node's name. The entries
                ``'target_size_strategy'``, ``'over_sampling_method'`` and
                ``'under_sampling_method'`` are read.
            X: feature array, indexed with ``train_indices`` / ``valid_indices``.
            Y: target array. ``np.argmax(..., axis=1)`` is applied to its
                training rows, so it is presumably 2-D one-hot encoded --
                TODO confirm against the caller.
            train_indices: indices of the rows that are resampled.
            valid_indices: indices of the validation rows, or ``None``.

        Returns:
            dict: empty if the target size strategy is ``'none'``. Otherwise
            the keys ``"X"``, ``"Y"`` and ``"train_indices"`` are set, plus
            ``"valid_indices"`` when ``valid_indices`` was given.
        """
        hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
        logger = logging.getLogger('autonet')

        if hyperparameter_config['target_size_strategy'] == 'none':
            return dict()

        # Each sampler receives its own ConfigWrapper, keyed by the chosen method name.
        over_sampling_method = self.over_sampling_methods[hyperparameter_config['over_sampling_method']](
            ConfigWrapper(hyperparameter_config['over_sampling_method'], hyperparameter_config)
        )
        under_sampling_method = self.under_sampling_methods[hyperparameter_config['under_sampling_method']](
            ConfigWrapper(hyperparameter_config['under_sampling_method'], hyperparameter_config)
        )
        target_size_strategy = self.target_size_strategies[hyperparameter_config['target_size_strategy']]()

        # Collapse the targets to integer labels for resampling; the encoder
        # fitted here turns the resampled labels back into indicator columns.
        # NOTE(review): the encoder only sees labels present in the training
        # rows, so a class missing there would presumably yield fewer columns
        # than Y[valid_indices] has -- confirm whether callers guarantee that
        # every class occurs in the training split.
        y = np.argmax(Y[train_indices], axis=1).astype(int)
        ohe = OneHotEncoder(categories="auto", sparse=False)
        ohe.fit(y.reshape((-1, 1)))

        over_sampling_target_size = target_size_strategy.over_sample_strategy(y)
        under_sampling_target_size = target_size_strategy.under_sample_strategy(y)

        random_seed = pipeline_config["random_seed"]
        logger.debug("Distribution before resample: %s", np.unique(y, return_counts=True)[1])
        X_resampled, y_resampled = over_sampling_method.resample(
            X[train_indices], y, over_sampling_target_size, random_seed)
        X_resampled, y_resampled = under_sampling_method.resample(
            X_resampled, y_resampled, under_sampling_target_size, random_seed)
        logger.debug("Distribution after resample: %s", np.unique(y_resampled, return_counts=True)[1])

        Y_resampled = ohe.transform(y_resampled.reshape((-1, 1)))

        if valid_indices is None:
            # np.arange always yields an integer array, even for zero rows;
            # np.array(list(range(0))) would be float64 and unusable as an index.
            return {"X": X_resampled, "Y": Y_resampled,
                    "train_indices": np.arange(X_resampled.shape[0])}

        # The validation rows are taken from the original data, not resampled.
        X, Y, split_indices = CrossValidation.get_validation_set_split_indices(
            pipeline_config,
            X_train=X_resampled, X_valid=X[valid_indices],
            Y_train=Y_resampled, Y_valid=Y[valid_indices], allow_shuffle=False)
        return {"X": X, "Y": Y, "train_indices": split_indices[0], "valid_indices": split_indices[1]}
    def _apply_default_pipeline_settings(pipeline):
        """Install this network type's default components on ``pipeline``.

        The shared ``AutoNetFeatureData`` defaults are applied first. After
        that the following entries are registered on the pipeline nodes:

        * final activation ``'softmax'`` -- ``nn.Softmax(1)``
        * loss modules ``'cross_entropy'`` and ``'cross_entropy_weighted'``,
          both backed by ``nn.CrossEntropyLoss``; the weighted variant is
          registered together with a ``LossWeightStrategyWeighted`` instance
        * metric ``'accuracy'``
        * over-sampling methods ``'random'`` and ``'smote'``, under-sampling
          method ``'random'``, and the target size strategies ``'upsample'``,
          ``'downsample'``, ``'average'`` and ``'median'``

        Finally the train node's ``default_minimize_value`` is set to
        ``False``, the cross-validation node's
        ``use_stratified_cv_split_default`` to ``True`` and the one-hot
        encoding node's ``encode_Y`` to ``True``.

        Args:
            pipeline: object indexable by node name; it is modified in place.

        Returns:
            The same ``pipeline`` object that was passed in.
        """
        from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
        from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
        from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
        from autoPyTorch.pipeline.nodes.train_node import TrainNode
        from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
        from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
        from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
        from autoPyTorch.components.preprocessing.resampling import RandomOverSamplingWithReplacement, RandomUnderSamplingWithReplacement, SMOTE, \
            TargetSizeStrategyAverageSample, TargetSizeStrategyDownsample, TargetSizeStrategyMedianSample, TargetSizeStrategyUpsample

        import torch.nn as nn
        from autoPyTorch.components.metrics.standard_metrics import accuracy
        from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeighted

        # Base-class defaults go in before anything specific to this class.
        AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

        net_selector = pipeline[NetworkSelector.get_name()]
        net_selector.add_final_activation('softmax', nn.Softmax(1))

        # Same loss class twice: once without and once with a weight strategy.
        loss_selector = pipeline[LossModuleSelector.get_name()]
        loss_selector.add_loss_module('cross_entropy', nn.CrossEntropyLoss,
                                      None, True)
        loss_selector.add_loss_module('cross_entropy_weighted',
                                      nn.CrossEntropyLoss,
                                      LossWeightStrategyWeighted(), True)

        metric_selector = pipeline[MetricSelector.get_name()]
        metric_selector.add_metric('accuracy', accuracy)

        # Resampling choices; classes (not instances) are registered here.
        resample_selector = pipeline[ResamplingStrategySelector.get_name()]
        resample_selector.add_over_sampling_method(
            'random', RandomOverSamplingWithReplacement)
        resample_selector.add_over_sampling_method('smote', SMOTE)
        resample_selector.add_under_sampling_method(
            'random', RandomUnderSamplingWithReplacement)
        resample_selector.add_target_size_strategy(
            'upsample', TargetSizeStrategyUpsample)
        resample_selector.add_target_size_strategy(
            'downsample', TargetSizeStrategyDownsample)
        resample_selector.add_target_size_strategy(
            'average', TargetSizeStrategyAverageSample)
        resample_selector.add_target_size_strategy(
            'median', TargetSizeStrategyMedianSample)

        train_node = pipeline[TrainNode.get_name()]
        train_node.default_minimize_value = False

        cv = pipeline[CrossValidation.get_name()]
        cv.use_stratified_cv_split_default = True

        one_hot_encoding_node = pipeline[OneHotEncoding.get_name()]
        one_hot_encoding_node.encode_Y = True

        return pipeline
    def get_default_pipeline(cls):
        """Assemble the default node hierarchy and apply the class defaults.

        The resulting structure is ``AutoNetSettings`` followed by an
        ``OptimizationAlgorithm`` node wrapping a ``CrossValidation`` node,
        which in turn holds the preprocessing, selector and training nodes
        in the order listed below.

        Returns:
            The newly built ``Pipeline`` after
            ``cls._apply_default_pipeline_settings`` has been called on it.
        """
        from autoPyTorch.pipeline.base.pipeline import Pipeline
        from autoPyTorch.pipeline.nodes.autonet_settings import AutoNetSettings
        from autoPyTorch.pipeline.nodes.optimization_algorithm import OptimizationAlgorithm
        from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
        from autoPyTorch.pipeline.nodes.imputation import Imputation
        from autoPyTorch.pipeline.nodes.normalization_strategy_selector import NormalizationStrategySelector
        from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
        from autoPyTorch.pipeline.nodes.preprocessor_selector import PreprocessorSelector
        from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
        from autoPyTorch.pipeline.nodes.embedding_selector import EmbeddingSelector
        from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
        from autoPyTorch.pipeline.nodes.optimizer_selector import OptimizerSelector
        from autoPyTorch.pipeline.nodes.lr_scheduler_selector import LearningrateSchedulerSelector
        from autoPyTorch.pipeline.nodes.log_functions_selector import LogFunctionsSelector
        from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
        from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
        from autoPyTorch.pipeline.nodes.train_node import TrainNode

        # Nodes are instantiated outermost-first / left-to-right, matching the
        # order a single nested literal would evaluate them in.
        settings_node = AutoNetSettings()

        # Everything that is executed once per cross-validation split.
        per_split_nodes = [
            Imputation(),
            NormalizationStrategySelector(),
            OneHotEncoding(),
            PreprocessorSelector(),
            ResamplingStrategySelector(),
            EmbeddingSelector(),
            NetworkSelector(),
            OptimizerSelector(),
            LearningrateSchedulerSelector(),
            LogFunctionsSelector(),
            MetricSelector(),
            LossModuleSelector(),
            TrainNode(),
        ]
        cross_validation_node = CrossValidation(per_split_nodes)
        optimization_node = OptimizationAlgorithm([cross_validation_node])

        pipeline = Pipeline([settings_node, optimization_node])

        cls._apply_default_pipeline_settings(pipeline)
        return pipeline
    def test_cross_validation(self):
        """Check the CrossValidation node's results for four split setups.

        Covered are: 3-fold ``k_fold``, a ``validation_split`` of 0.3,
        3-fold ``stratified_k_fold`` and an explicitly supplied validation set.
        """

        class ResultNode(PipelineNode):
            # Reports the summed features of the training and validation rows.
            def fit(self, X, Y, train_indices, valid_indices):
                train_total = np.sum(X[train_indices])
                valid_total = np.sum(X[valid_indices])
                return {'loss': valid_total, 'info': {'a': train_total, 'b': valid_total}}

        pipeline = Pipeline([
            CrossValidation([
                ResultNode()
            ])
        ])

        # Both validators get the targets flattened to one dimension.
        cv_node = pipeline["CrossValidation"]
        cv_node.add_cross_validator("k_fold", KFold, lambda x: x.reshape((-1 ,)))
        cv_node.add_cross_validator("stratified_k_fold", StratifiedKFold, lambda x: x.reshape((-1 ,)))

        def run_fit(config, features, targets, features_valid=None, targets_valid=None):
            # Shared driver: derive the search space, pin the seed, describe
            # the data set and run the pipeline once.
            search_space = pipeline.get_hyperparameter_search_space(**config)
            config["random_seed"] = 42

            info = DataSetInfo()
            info.categorical_features = [None] * 3
            info.x_shape = features.shape
            info.y_shape = targets.shape

            return pipeline.fit_pipeline(
                hyperparameter_config=search_space, pipeline_config=config,
                X_train=features, Y_train=targets,
                X_valid=features_valid, Y_valid=targets_valid,
                budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
                optimize_start_time=time.time(), refit=False,
                dataset_info=info, rescore=False)

        small_x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        small_y = np.array([[1], [0], [1]])
        large_x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]])
        large_y = np.array([[1], [1], [0], [0], [1], [0]])

        # test cv_splits
        outcome = run_fit(
            pipeline.get_pipeline_config(cross_validator="k_fold", cross_validator_args={"n_splits": 3}),
            small_x, small_y)
        self.assertEqual(outcome['loss'], 15)
        self.assertDictEqual(outcome['info'], {'a': 30, 'b': 15})

        # test validation split
        outcome = run_fit(pipeline.get_pipeline_config(validation_split=0.3), small_x, small_y)
        self.assertEqual(outcome['loss'], 24)
        self.assertDictEqual(outcome['info'], {'a': 21, 'b': 24})

        # test stratified cv split
        outcome = run_fit(
            pipeline.get_pipeline_config(cross_validator="stratified_k_fold", cross_validator_args={"n_splits": 3}),
            large_x, large_y)
        self.assertEqual(outcome['loss'], 57)
        self.assertDictEqual(outcome['info'], {'a': 114, 'b': 57})

        # default config with the small data set handed in as validation data
        outcome = run_fit(pipeline.get_pipeline_config(), large_x, large_y,
                          features_valid=small_x, targets_valid=small_y)
        self.assertEqual(outcome['loss'], 45)
        self.assertDictEqual(outcome['info'], {'a': 171, 'b': 45})
示例#7
0
    def test_cross_validation(self):
        """Check the CrossValidation node's results for three split setups.

        Covered are: ``cv_splits=3``, a ``validation_split`` of 0.3 and
        ``cv_splits=3`` combined with ``use_stratified_cv_split=True``.
        """

        class ResultNode(PipelineNode):
            # Reports the summed training and validation features.
            def fit(self, X_train, X_valid):
                train_total = np.sum(X_train)
                valid_total = np.sum(X_valid)
                return {'loss': valid_total, 'info': {'a': train_total, 'b': valid_total}}

        pipeline = Pipeline([CrossValidation([ResultNode()])])

        def run_fit(config, features, targets):
            # Shared driver: derive the search space, mark the data as having
            # no categorical feature information and run the pipeline once.
            search_space = pipeline.get_hyperparameter_search_space(**config)
            config['categorical_features'] = None
            return pipeline.fit_pipeline(
                hyperparameter_config=search_space,
                pipeline_config=config,
                X_train=features,
                Y_train=targets,
                X_valid=None,
                Y_valid=None,
                budget=5,
                budget_type=BudgetTypeEpochs,
                one_hot_encoder=None,
                optimize_start_time=time.time())

        small_x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        small_y = np.array([[1], [0], [1]])

        # test cv_splits
        outcome = run_fit(pipeline.get_pipeline_config(cv_splits=3), small_x, small_y)
        self.assertEqual(outcome['loss'], 15)
        self.assertDictEqual(outcome['info'], {'a': 30, 'b': 15})

        # test validation split
        outcome = run_fit(pipeline.get_pipeline_config(validation_split=0.3), small_x, small_y)
        self.assertEqual(outcome['loss'], 24)
        self.assertDictEqual(outcome['info'], {'a': 21, 'b': 24})

        # test stratified cv split
        large_x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
                            [13, 14, 15], [16, 17, 18]])
        large_y = np.array([[1], [1], [0], [0], [1], [0]])

        outcome = run_fit(
            pipeline.get_pipeline_config(cv_splits=3, use_stratified_cv_split=True),
            large_x, large_y)
        self.assertEqual(outcome['loss'], 57)
        self.assertDictEqual(outcome['info'], {'a': 114, 'b': 57})