Example #1
    def setUp(self):
        self.pca_pipe_element = PipelineElement.create(
            'pca', {'n_components': [1, 2]}, test_disabled=True)
        self.svc_pipe_element = PipelineElement.create('svc', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        self.cv_object = KFold(n_splits=3)
        self.hyperpipe = Hyperpipe('god', self.cv_object)
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)
Example #2
    def setUp(self):
        # set up inner pipeline
        self.inner_hyperpipe = Hyperpipe('inner_pipe',
                                         KFold(n_splits=2),
                                         local_search=True)
        self.inner_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.inner_hyperpipe += self.inner_pipeline_test_element
        self.pipeline_fusion = PipelineStacking('fusion_element',
                                                [self.inner_hyperpipe])
        # set up outer pipeline
        self.outer_hyperpipe = Hyperpipe('outer_pipe', KFold(n_splits=2))
        self.outer_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.outer_hyperpipe += self.outer_pipeline_test_element
        self.outer_hyperpipe += self.pipeline_fusion

        self.X = np.arange(1, 101)
        self.y = np.ones((100, ))
Example #3
    def testCaseA(self):
        pca_n_components = [2, 5]
        svc_c = [.1, 1, 5]
        #svc_kernel = ['rbf']
        svc_kernel = ['rbf', 'linear']

        # SET UP HYPERPIPE
        my_pipe = Hyperpipe('primary_pipe',
                            optimizer='grid_search',
                            optimizer_params={},
                            metrics=['accuracy', 'precision', 'f1_score'],
                            inner_cv=KFold(n_splits=2, shuffle=True,
                                           random_state=3),
                            eval_final_performance=False)

        my_pipe += PipelineElement.create('standard_scaler')
        my_pipe += PipelineElement.create('pca',
                                          {'n_components': pca_n_components})
        my_pipe += PipelineElement.create('svc', {
            'C': svc_c,
            'kernel': svc_kernel
        })

        # START HYPERPARAMETER SEARCH
        my_pipe.fit(self.__X, self.__y)
        print(my_pipe._test_performances)
        pipe_results = {'train': [], 'test': []}
        for i in range(len(my_pipe._performance_history_list)):
            pipe_results['train'].extend(my_pipe._performance_history_list[i]
                                         ['accuracy_folds']['train'])
            pipe_results['test'].extend(
                my_pipe._performance_history_list[i]['accuracy_folds']['test'])

        print('\n\n')
        print('Running sklearn version...')
        # cv_outer = KFold(n_splits=2, shuffle=True, random_state=3)
        # shuffle=True is required when random_state is set on KFold
        cv_inner_1 = KFold(n_splits=2, shuffle=True, random_state=3)

        sk_results = {'train': [], 'test': []}

        for n_comp in pca_n_components:
            for c in svc_c:
                for current_kernel in svc_kernel:
                    tr_acc = []
                    val_acc = []
                    for train_2, val_1 in cv_inner_1.split(self.__X):

                        data_train_2 = self.__X[train_2]
                        print(data_train_2.shape)
                        data_val_1 = self.__X[val_1]
                        y_train_2 = self.__y[train_2]
                        y_val_1 = self.__y[val_1]

                        my_scaler = StandardScaler()
                        my_scaler.fit(data_train_2)
                        data_train_2 = my_scaler.transform(data_train_2)
                        data_val_1 = my_scaler.transform(data_val_1)

                        # Run PCA
                        my_pca = PCA(n_components=n_comp)
                        my_pca.fit(data_train_2)
                        data_tr_2_pca = my_pca.transform(data_train_2)
                        data_val_1_pca = my_pca.transform(data_val_1)

                        # Run SVC
                        my_svc = SVC(kernel=current_kernel, C=c)
                        my_svc.fit(data_tr_2_pca, y_train_2)

                        tr_acc.append(my_svc.score(data_tr_2_pca, y_train_2))
                        val_acc.append(my_svc.score(data_val_1_pca, y_val_1))
                        print('n_components: ', n_comp, 'kernel:',
                              current_kernel, 'c:', c)
                        print('Training 2:', tr_acc[-1], 'validation 1:',
                              val_acc[-1])

                    sk_results['train'].extend(tr_acc)
                    sk_results['test'].extend(val_acc)

        print('\nCompare results of last iteration (outer cv)...')
        print('SkL  Train:', sk_results['train'])
        print('Pipe Train:', pipe_results['train'])
        print('SkL  test: ', sk_results['test'])
        print('Pipe test: ', pipe_results['test'])

        self.assertEqual(sk_results['test'], pipe_results['test'])
        self.assertEqual(sk_results['train'], pipe_results['train'])
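
For reference, the manual nested loop above reproduces by hand what scikit-learn's GridSearchCV does internally. A minimal, self-contained sketch of the same search using only the public sklearn API (synthetic data via make_classification; not part of the original test):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=20, random_state=3)

pipe = Pipeline([('scaler', StandardScaler()),
                 ('pca', PCA()),
                 ('svc', SVC())])

param_grid = {'pca__n_components': [2, 5],
              'svc__C': [0.1, 1, 5],
              'svc__kernel': ['rbf', 'linear']}

# shuffle=True is required when random_state is set on KFold
search = GridSearchCV(pipe, param_grid,
                      cv=KFold(n_splits=2, shuffle=True, random_state=3),
                      scoring='accuracy', return_train_score=True)
search.fit(X, y)
print(search.best_params_)
print(search.cv_results_['mean_test_score'])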
Example #4
class CVTestsLocalSearchTrue(unittest.TestCase):
    def setUp(self):
        # set up inner pipeline
        self.inner_hyperpipe = Hyperpipe('inner_pipe',
                                         KFold(n_splits=2),
                                         local_search=True)
        self.inner_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.inner_hyperpipe += self.inner_pipeline_test_element
        self.pipeline_fusion = PipelineStacking('fusion_element',
                                                [self.inner_hyperpipe])
        # set up outer pipeline
        self.outer_hyperpipe = Hyperpipe('outer_pipe', KFold(n_splits=2))
        self.outer_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.outer_hyperpipe += self.outer_pipeline_test_element
        self.outer_hyperpipe += self.pipeline_fusion

        self.X = np.arange(1, 101)
        self.y = np.ones((100, ))

    def test_default_split_fit(self):
        """
        test default splitting mode: 80% validation and 20% testing
        make sure that DURING the optimization the optimum pipeline is fitted with the correct amount of data
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2)
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2)

        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        print('local_search true: outer pipeline data:')
        print(sorted(outer_data))

        print('local_search true: inner pipeline data:')
        print(sorted(inner_data))

        # we expect that all items from inner_data are existent in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # outer data should be 50% (KFold) of 80% (ShuffleSplit) of the original X (n=100),
        # with a held-out test set of 20%
        self.assertEqual(len(outer_data), 40)
        # inner data should be 50% of 80% of the outer data
        self.assertEqual(len(inner_data), 16)
        # equivalently: inner_data is 50% of 80% of outer_data's length
        self.assertEqual(len(inner_data), 0.5 * 0.8 * len(outer_data))

    def test_default_split_predict(self):
        """
        test default splitting mode: 80% validation and 20% testing
        make sure that AFTER the optimization the optimum pipeline is fitted with the correct amount of data
        which means test that the optimum pipe is fitted to the validation data and tested with the test data
        """
        self.outer_hyperpipe.debug_cv_mode = False
        self.inner_hyperpipe.debug_cv_mode = False
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2, train_size=0.8)
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2, train_size=0.8)

        self.outer_hyperpipe.fit(self.X, self.y)

        print('local_search true: outer pipeline data:')
        print(self.outer_pipeline_test_element.base_element.data_dict['fit_X'])

        print('local_search true: inner pipeline data:')
        print(self.inner_pipeline_test_element.base_element.data_dict['fit_X'])

        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        # we expect that all items from inner_data are existent in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # test that it is only 80% of original X (n=100) and that there is a test_x of 20% size
        self.assertEqual(len(outer_data), 80)
        # inner data should be 80% of 80% of the original
        self.assertEqual(len(inner_data), 64)
        # we also expect that inner_data is 80% of length from outer_data
        self.assertEqual(len(inner_data), 0.8 * len(outer_data))

    def test_no_split(self):
        """
        test no splitting mode: the data is NOT split into test and validation set
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.eval_final_performance = False

        self.outer_hyperpipe.fit(self.X, self.y)
        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        # we expect that all items from inner_data are existent in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # test that it is only 50% of original X (n=100)
        self.assertEqual(len(outer_data), 50)
        # inner data should be 50% of 50% of the original = 25%
        self.assertEqual(len(inner_data), 25)

    def test_CV_split(self):
        """
        test cv splitting mode: the entire search for hyperparameters is cross validated
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.eval_final_performance = False
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = KFold(
            n_splits=2)
        self.inner_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = KFold(
            n_splits=2)

        self.outer_hyperpipe.fit(self.X, self.y)
        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        # we expect that all items from inner_data are existent in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # the outer KFold(n_splits=2) halves the data first: 100/2 = 50
        # outer data should be 50% of those 50 samples
        self.assertEqual(len(outer_data), 25)
        # inner data should be 25% of the outer base = 12.5% of 50, i.e. 6 or 7 samples
        self.assertTrue(len(inner_data) == 6 or len(inner_data) == 7)
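
The sizes asserted in this test class follow directly from composing the splitters. A standalone numpy/sklearn sketch (outside the framework) reproducing the 100 -> 80 -> 40 and 40 -> 32 -> 16 chain that test_default_split_fit checks:

import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit

X = np.arange(1, 101)  # n = 100, as in setUp

# outer pipe: 80/20 hyperparameter-fitting split, then KFold(2) inside
train_80, _ = next(ShuffleSplit(n_splits=1, test_size=0.2).split(X))
outer_fold, _ = next(KFold(n_splits=2).split(X[train_80]))
print(len(outer_fold))  # 40 = 0.5 * 0.8 * 100

# inner pipe repeats the same scheme on the outer pipe's 40 samples
X_outer = X[train_80][outer_fold]
train_32, _ = next(ShuffleSplit(n_splits=1, test_size=0.2).split(X_outer))
inner_fold, _ = next(KFold(n_splits=2).split(X_outer[train_32]))
print(len(inner_fold))  # 16 = 0.5 * 0.8 * 40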
Example #5
class CVTestsLocalSearchFalse(unittest.TestCase):
    def setUp(self):
        self.outer_hyperpipe = Hyperpipe('outer_pipe', KFold(n_splits=2))

        # set up inner pipeline
        self.inner_hyperpipe = Hyperpipe(
            'inner_pipe',
            KFold(n_splits=2),
            optimizer=self.outer_hyperpipe.optimizer,
            local_search=False)
        self.inner_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.inner_hyperpipe += self.inner_pipeline_test_element
        self.pipeline_fusion = PipelineStacking('fusion_element',
                                                [self.inner_hyperpipe])

        # set up outer pipeline
        self.outer_pipeline_test_element = PipelineElement.create(
            'test_wrapper')
        self.outer_hyperpipe += self.outer_pipeline_test_element
        self.outer_hyperpipe += self.pipeline_fusion

        self.X = np.arange(1, 101)
        self.y = np.ones((100, ))

        self.inner_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.debug_cv_mode = True

    def test_no_split(self):

        self.outer_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.eval_final_performance = False

        self.outer_hyperpipe.fit(self.X, self.y)

        print('local_search false: outer pipeline data:')
        print(self.outer_pipeline_test_element.base_element.data_dict['fit_X'])

        print('local_search false: inner pipeline data:')
        print(self.inner_pipeline_test_element.base_element.data_dict['fit_X'])

        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 50)

    def test_default_split(self):

        self.outer_hyperpipe.eval_final_performance = True
        self.inner_hyperpipe.eval_final_performance = True

        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 40)

    def test_cv_split(self):

        self.outer_hyperpipe.hyperparameter_fitting_cv_object = KFold(
            n_splits=2)
        # should be ignored:
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = KFold(
            n_splits=2)

        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict[
            'fit_X'].tolist()

        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 25)
        self.assertEqual(len(outer_data), len(inner_data))
Example #6
    def testCaseA(self):
        pca_n_components = [2, 5]
        svc_c = [.1, 1]
        svc_kernel = ['rbf']
        # svc_kernel = ['rbf','linear']

        # SET UP HYPERPIPE
        my_pipe = Hyperpipe('primary_pipe',
                            optimizer='grid_search',
                            optimizer_params={},
                            inner_cv=KFold(n_splits=2, shuffle=True,
                                           random_state=3),
                            outer_cv=KFold(n_splits=2, shuffle=True,
                                           random_state=3),
                            verbose=2,
                            eval_final_performance=True)

        my_pipe += PipelineElement.create('standard_scaler')
        my_pipe += PipelineElement.create('pca', {'n_components': pca_n_components})
        my_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})

        # START HYPERPARAMETER SEARCH
        my_pipe.fit(self.__X, self.__y)
        from Framework import LogExtractor
        log_ex = LogExtractor.LogExtractor(my_pipe.result_tree)
        log_ex.extract_csv("test_case_A2.csv")

        # collect the pipeline's fold results; needed for the comparison below
        print(my_pipe.test_performances)
        pipe_results = {'train': [], 'test': []}
        for i in range(len(my_pipe.performance_history_list)):
            pipe_results['train'].extend(
                my_pipe.performance_history_list[i]['accuracy_folds']['train'])
            pipe_results['test'].extend(
                my_pipe.performance_history_list[i]['accuracy_folds']['test'])

        print('\n\n')
        print('Running sklearn version...')
        cv_outer = KFold(n_splits=2, shuffle=True, random_state=3)
        cv_inner_1 = KFold(n_splits=2, shuffle=True, random_state=3)

        for train_1, test in cv_outer.split(self.__X):
            data_train_1 = self.__X[train_1]
            data_test = self.__X[test]
            y_train_1 = self.__y[train_1]
            y_test = self.__y[test]
            sk_results = {'train': [], 'test': []}

            for n_comp in pca_n_components:
                for current_kernel in svc_kernel:
                    for c in svc_c:
                        tr_acc = []
                        val_acc = []

                        for train_2, val_1 in cv_inner_1.split(
                                data_train_1):
                            data_train_2 = data_train_1[train_2]
                            data_val_1 = data_train_1[val_1]
                            y_train_2 = y_train_1[train_2]
                            y_val_1 = y_train_1[val_1]

                            my_scaler = StandardScaler()
                            my_scaler.fit(data_train_2)
                            data_train_2 = my_scaler.transform(data_train_2)
                            data_val_1 = my_scaler.transform(data_val_1)

                            # Run PCA
                            my_pca = PCA(n_components=n_comp)
                            my_pca.fit(data_train_2)
                            data_tr_2_pca = my_pca.transform(data_train_2)
                            data_val_1_pca = my_pca.transform(data_val_1)

                            # Run SVC
                            my_svc = SVC(kernel=current_kernel, C=c)
                            my_svc.fit(data_tr_2_pca, y_train_2)

                            tr_acc.append(my_svc.score(data_tr_2_pca, y_train_2))
                            val_acc.append(my_svc.score(data_val_1_pca, y_val_1))
                            print('n_components: ', n_comp, 'kernel:',
                                  current_kernel, 'c:', c)
                            print('Training 2:', tr_acc[-1],
                                  'validation 1:', val_acc[-1])

                        sk_results['train'].extend(tr_acc)
                        sk_results['test'].extend(val_acc)

        print('\nCompare results of last iteration (outer cv)...')
        print('SkL  Train:', sk_results['train'])
        print('Pipe Train:', pipe_results['train'])
        print('SkL  test: ', sk_results['test'])
        print('Pipe test: ', pipe_results['test'])

        self.assertEqual(sk_results['test'], pipe_results['test'])
        self.assertEqual(sk_results['train'], pipe_results['train'])
Example #7
    def testCaseB(self):
        pca_n_components = [7, 15, 10]
        svc_c = [.1, 1]
        #svc_kernel = ['rbf']
        svc_kernel = ['rbf', 'linear']
        cv_outer = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
        cv_inner_1 = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
        cv_inner_2 = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)

        # SET UP HYPERPIPE
        outer_pipe = Hyperpipe('outer_pipe',
                               optimizer='grid_search',
                               metrics=['accuracy'],
                               inner_cv=cv_inner_1,
                               outer_cv=cv_outer,
                               eval_final_performance=True)
        inner_pipe = Hyperpipe('pca_pipe',
                               optimizer='grid_search',
                               inner_cv=cv_inner_2,
                               eval_final_performance=False)

        inner_pipe.add(PipelineElement.create('standard_scaler'))
        inner_pipe.add(
            PipelineElement.create('ae_pca',
                                   {'n_components': pca_n_components}))

        pipeline_fusion = PipelineStacking('fusion_element', [inner_pipe])

        outer_pipe.add(pipeline_fusion)
        outer_pipe.add(
            PipelineElement.create('svc', {
                'C': svc_c,
                'kernel': svc_kernel
            }))

        # START HYPERPARAMETER SEARCH
        outer_pipe.fit(self.__X, self.__y)
        print(outer_pipe._test_performances)
        pipe_results = {'train': [], 'test': []}
        for i in range(len(outer_pipe._performance_history_list)):
            pipe_results['train'].extend(
                outer_pipe._performance_history_list[i]['accuracy_folds']
                ['train'])
            pipe_results['test'].extend(outer_pipe._performance_history_list[i]
                                        ['accuracy_folds']['test'])

        print(outer_pipe._test_performances['accuracy'])

        print('\n\n')
        print('Running sklearn version...\n')
        opt_tr_acc = []
        opt_test_acc = []

        for train_1, test in cv_outer.split(self.__X):
            data_train_1 = self.__X[train_1]
            data_test = self.__X[test]
            y_train_1 = self.__y[train_1]
            y_test = self.__y[test]
            config_inner_1 = {'C': [], 'kernel': []}
            sk_results_inner1 = {
                'train_2': [],
                'val_1': [],
                'train_2_mean': [],
                'val_1_mean': []
            }
            print('Outer Split')
            print('n train_1:', data_train_1.shape[0], '\n')

            for c in svc_c:
                for current_kernel in svc_kernel:
                    config_inner_1['C'].extend([c])
                    config_inner_1['kernel'].extend([current_kernel])

                    print('C:', c, 'Kernel:', current_kernel, '\n')
                    svc_score_tr = []
                    svc_score_te = []
                    fold_cnt = 1
                    for train_2, val_1 in cv_inner_1.split(data_train_1):
                        print('\n\nSklearn Outer Pipe FoldMetrics', fold_cnt)

                        data_train_2 = data_train_1[train_2]
                        data_val_1 = data_train_1[val_1]
                        y_train_2 = y_train_1[train_2]
                        y_val_1 = y_train_1[val_1]
                        print('n train_2:', data_train_2.shape[0], '\n')

                        config_inner_2 = {'n_comp': []}
                        print('Sklearn PCA Pipe')
                        sk_results_inner2 = {
                            'train_3': [],
                            'val_2': [],
                            'train_3_mean': [],
                            'val_2_mean': []
                        }
                        for n_comp in pca_n_components:
                            config_inner_2['n_comp'].extend([n_comp])

                            tr_acc = []
                            val_acc = []

                            # print('Some training data:',
                            #       data_train_2[0:2, 0:2])
                            for train_3, val_2 in cv_inner_2.split(
                                    data_train_2):

                                data_train_3 = data_train_2[train_3]
                                data_val_2 = data_train_2[val_2]

                                my_scaler = StandardScaler()
                                my_scaler.fit(data_train_3)
                                data_train_3 = my_scaler.transform(
                                    data_train_3)
                                data_val_2 = my_scaler.transform(data_val_2)

                                # Run PCA
                                my_pca = PCA_AE_Wrapper(n_components=n_comp)
                                my_pca.fit(data_train_3)

                                mae_tr = my_pca.score(data_train_3)
                                mae_te = my_pca.score(data_val_2)

                                tr_acc.append(mae_tr)
                                val_acc.append(mae_te)

                            sk_results_inner2['train_3'].extend(tr_acc)
                            sk_results_inner2['val_2'].extend(val_acc)
                            sk_results_inner2['train_3_mean'].extend(
                                [np.mean(tr_acc)])
                            sk_results_inner2['val_2_mean'].extend(
                                [np.mean(val_acc)])

                            print('n_comp:', n_comp)
                            print('n train_3 fold 1:', data_train_3.shape[0])
                            print('Training 3 mean:', [np.mean(tr_acc)],
                                  'validation 2 mean:', [np.mean(val_acc)])
                        # find best config for val 2
                        best_config_id = np.argmin(
                            sk_results_inner2['val_2_mean'])
                        print('Best PCA config:',
                              config_inner_2['n_comp'][best_config_id], '\n')
                        # fit optimum pipe

                        my_scaler = StandardScaler()
                        my_scaler.fit(data_train_2)
                        data_train_2 = my_scaler.transform(data_train_2)
                        data_val_1 = my_scaler.transform(data_val_1)

                        # Run PCA
                        my_pca = PCA_AE_Wrapper(
                            n_components=config_inner_2['n_comp']
                            [best_config_id])
                        my_pca.fit(data_train_2)
                        data_tr_2_pca = my_pca.transform(data_train_2)
                        data_val_1_pca = my_pca.transform(data_val_1)

                        # Run SVC
                        my_svc = SVC(kernel=current_kernel, C=c)
                        my_svc.fit(data_tr_2_pca, y_train_2)
                        svc_score_tr.append(
                            my_svc.score(data_tr_2_pca, y_train_2))
                        svc_score_te.append(
                            my_svc.score(data_val_1_pca, y_val_1))
                        print('Fit Optimum PCA Config and train with SVC')
                        print('n train 2:', data_train_2.shape[0])
                        print('n_comp:',
                              config_inner_2['n_comp'][best_config_id])
                        print('SVC Train:', svc_score_tr[-1])
                        print('SVC test:', svc_score_te[-1], '\n\n')
                        sk_results_inner1['train_2'].append(svc_score_tr[-1])
                        sk_results_inner1['val_1'].append(svc_score_te[-1])
                        fold_cnt += 1
                    sk_results_inner1['train_2_mean'].append(
                        np.mean(svc_score_tr))
                    sk_results_inner1['val_1_mean'].append(
                        np.mean(svc_score_te))

            print('\nNow find best config for SVC...')
            best_config_id_inner_1 = np.argmax(sk_results_inner1['val_1_mean'])
            print('Some test data:')
            print(data_test.shape)
            print(data_test[0:2, 0:2])

            # fit optimum pipe
            my_scaler = StandardScaler()
            my_scaler.fit(data_train_1)
            data_train_1 = my_scaler.transform(data_train_1)
            data_test = my_scaler.transform(data_test)

            # Run PCA
            my_pca = PCA_AE_Wrapper(
                n_components=config_inner_2['n_comp'][best_config_id])
            my_pca.fit(data_train_1)
            data_tr_1_pca = my_pca.transform(data_train_1)
            data_test_pca = my_pca.transform(data_test)

            # Run SVC
            my_svc = SVC(
                kernel=config_inner_1['kernel'][best_config_id_inner_1],
                C=config_inner_1['C'][best_config_id_inner_1])
            print('Best overall config:...')
            print('C = ', config_inner_1['C'][best_config_id_inner_1])
            print('kernel=', config_inner_1['kernel'][best_config_id_inner_1])
            print('pca_n_comp=', config_inner_2['n_comp'][best_config_id])
            print('n train 1:', data_train_1.shape[0])
            my_svc.fit(data_tr_1_pca, y_train_1)

            opt_tr_acc.append(my_svc.score(data_tr_1_pca, y_train_1))
            opt_test_acc.append(my_svc.score(data_test_pca, y_test))
            print('Train Acc:', opt_tr_acc[-1])
            print('test Acc:', opt_test_acc[-1])

        print('\nCompare results of last iteration (outer cv)...')
        print('SkL  Train:', sk_results_inner1['train_2'])
        print('Pipe Train:', pipe_results['train'])
        print('SkL  test: ', sk_results_inner1['val_1'])
        print('Pipe test: ', pipe_results['test'])
        print('\nEval final performance:')
        print('Pipe final perf:', outer_pipe._test_performances['accuracy'])
        print('Sklearn final perf:', opt_test_acc)
        self.assertEqual(sk_results_inner1['train_2'], pipe_results['train'])
        self.assertEqual(sk_results_inner1['val_1'], pipe_results['test'])
        self.assertEqual(opt_test_acc,
                         outer_pipe._test_performances['accuracy'])
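
The heart of this replication is the config-selection step: np.argmin over the mean validation errors picks the PCA setting, np.argmax over the mean validation accuracies picks the SVC setting. That pattern in isolation (toy, hypothetical scores):

import numpy as np

configs = [{'C': 0.1, 'kernel': 'rbf'}, {'C': 0.1, 'kernel': 'linear'},
           {'C': 1, 'kernel': 'rbf'}, {'C': 1, 'kernel': 'linear'}]
val_means = np.array([0.71, 0.78, 0.80, 0.76])  # mean validation accuracy per config

best_id = np.argmax(val_means)  # maximize accuracy; use argmin for error metrics
print('best config:', configs[best_id])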
Example #8
    def testCaseC2(self):
        pca_n_components = [5, 10]
        svc_c = [0.1]
        svc_c_2 = [1]
        #svc_kernel = ['rbf']
        svc_kernel = ['linear']

        # SET UP HYPERPIPE

        outer_pipe = Hyperpipe('outer_pipe',
                               optimizer='grid_search',
                               metrics=['accuracy'],
                               inner_cv=ShuffleSplit(n_splits=1,
                                                     test_size=0.2,
                                                     random_state=3),
                               outer_cv=ShuffleSplit(n_splits=1,
                                                     test_size=0.2,
                                                     random_state=3),
                               eval_final_performance=True)

        # Create pipe for first data source
        pipe_source_1 = Hyperpipe('source_1',
                                  optimizer='grid_search',
                                  inner_cv=ShuffleSplit(n_splits=1,
                                                        test_size=0.2,
                                                        random_state=3),
                                  eval_final_performance=False)

        pipe_source_1.add(
            PipelineElement.create('SourceSplitter',
                                   {'column_indices': [np.arange(0, 10)]}))
        pipe_source_1.add(
            PipelineElement.create('pca', {'n_components': pca_n_components}))
        pipe_source_1.add(
            PipelineElement.create('svc', {
                'C': svc_c,
                'kernel': svc_kernel
            }))

        # Create pipe for second data source
        pipe_source_2 = Hyperpipe('source_2',
                                  optimizer='grid_search',
                                  inner_cv=ShuffleSplit(n_splits=1,
                                                        test_size=0.2,
                                                        random_state=3),
                                  eval_final_performance=False)

        pipe_source_2.add(
            PipelineElement.create('SourceSplitter',
                                   {'column_indices': [np.arange(10, 20)]}))

        pipe_source_2.add(
            PipelineElement.create('pca', {'n_components': pca_n_components}))
        pipe_source_2.add(
            PipelineElement.create('svc', {
                'C': svc_c,
                'kernel': svc_kernel
            }))
        # Create pipe for third data source
        pipe_source_3 = Hyperpipe('source_3',
                                  optimizer='grid_search',
                                  inner_cv=ShuffleSplit(n_splits=1,
                                                        test_size=0.2,
                                                        random_state=3),
                                  eval_final_performance=False)

        pipe_source_3.add(
            PipelineElement.create('SourceSplitter',
                                   {'column_indices': [np.arange(20, 30)]}))
        pipe_source_3.add(
            PipelineElement.create('pca', {'n_components': pca_n_components}))
        pipe_source_3.add(
            PipelineElement.create('svc', {
                'C': svc_c,
                'kernel': svc_kernel
            }))

        # pipeline_fusion = PipelineStacking('multiple_source_pipes',[pipe_source_1, pipe_source_2, pipe_source_3], voting=False)
        pipeline_fusion = PipelineStacking(
            'multiple_source_pipes',
            [pipe_source_1, pipe_source_2, pipe_source_3])

        outer_pipe.add(pipeline_fusion)
        #outer_pipe.add(PipelineElement.create('svc', {'C': svc_c_2, 'kernel': svc_kernel}))
        #outer_pipe.add(PipelineElement.create('knn',{'n_neighbors':[15]}))
        outer_pipe.add(
            PipelineElement.create('kdnn', {
                'target_dimension': [2],
                'nb_epoch': [10]
            }))

        # START HYPERPARAMETER SEARCH
        outer_pipe.fit(self.__X, self.__y)
        print(outer_pipe._test_performances)
        pipe_results = {'train': [], 'test': []}
        for i in range(int(len(outer_pipe._performance_history_list) / 2)):
            pipe_results['train'].extend(
                outer_pipe._performance_history_list[i]['accuracy_folds']
                ['train'])
            pipe_results['test'].extend(outer_pipe._performance_history_list[i]
                                        ['accuracy_folds']['test'])

        print(outer_pipe._test_performances['accuracy'])
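
The three SourceSplitter elements route disjoint column ranges into their own sub-pipes. In plain numpy, the splitting they are configured to perform amounts to the following (illustrative only; SourceSplitter itself is a framework element):

import numpy as np

X = np.random.rand(100, 30)  # three concatenated 10-feature sources

source_1 = X[:, np.arange(0, 10)]
source_2 = X[:, np.arange(10, 20)]
source_3 = X[:, np.arange(20, 30)]
print(source_1.shape, source_2.shape, source_3.shape)  # (100, 10) each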
Example #9
class HyperpipeTests(unittest.TestCase):
    def setUp(self):
        self.pca_pipe_element = PipelineElement.create(
            'pca', {'n_components': [1, 2]}, test_disabled=True)
        self.svc_pipe_element = PipelineElement.create('svc', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        self.cv_object = KFold(n_splits=3)
        self.hyperpipe = Hyperpipe('god', self.cv_object)
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    def test_init(self):
        self.assertEqual(self.hyperpipe.name, 'god')
        # assure pipeline has two steps, first the pca and second the svc
        self.assertEqual(len(self.hyperpipe._pipe.steps), 2)
        self.assertIs(self.hyperpipe._pipe.steps[0][1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe._pipe.steps[1][1], self.svc_pipe_element)

    def test_hyperparameters(self):
        # hyperparameters
        self.assertDictEqual(
            self.hyperpipe.hyperparameters, {
                'pca': {
                    'n_components': [1, 2],
                    'test_disabled': True
                },
                'svc': {
                    'C': [0.1, 1],
                    'kernel': ['rbf', 'sigmoid']
                }
            })
        # sklearn params
        # Todo: has no sklearn attribute
        # config grid
        # print(self.hyperpipe.config_grid)
        expected_config_grid = [{
            'pca__n_components': 1,
            'pca__disabled': False,
            'svc__C': 0.1,
            'svc__kernel': 'rbf'
        }, {
            'pca__n_components': 1,
            'pca__disabled': False,
            'svc__C': 0.1,
            'svc__kernel': 'sigmoid'
        }, {
            'pca__n_components': 1,
            'pca__disabled': False,
            'svc__C': 1,
            'svc__kernel': 'rbf'
        }, {
            'pca__n_components': 1,
            'pca__disabled': False,
            'svc__C': 1,
            'svc__kernel': 'sigmoid'
        }, {
            'pca__n_components': 2,
            'pca__disabled': False,
            'svc__C': 0.1,
            'svc__kernel': 'rbf'
        }, {
            'pca__n_components': 2,
            'pca__disabled': False,
            'svc__C': 0.1,
            'svc__kernel': 'sigmoid'
        }, {
            'pca__n_components': 2,
            'pca__disabled': False,
            'svc__C': 1,
            'svc__kernel': 'rbf'
        }, {
            'pca__n_components': 2,
            'pca__disabled': False,
            'svc__C': 1,
            'svc__kernel': 'sigmoid'
        }, {
            'pca__disabled': True,
            'svc__C': 0.1,
            'svc__kernel': 'rbf'
        }, {
            'pca__disabled': True,
            'svc__C': 0.1,
            'svc__kernel': 'sigmoid'
        }, {
            'pca__disabled': True,
            'svc__C': 1,
            'svc__kernel': 'rbf'
        }, {
            'pca__disabled': True,
            'svc__C': 1,
            'svc__kernel': 'sigmoid'
        }]
        # sort key/value pairs so the values are compared too, not just the keys
        expected_config_grid = [sorted(i.items()) for i in expected_config_grid]
        actual_config_grid = [sorted(i.items()) for i in self.hyperpipe.config_grid]
        self.assertListEqual(actual_config_grid, expected_config_grid)
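
The expected grid is the Cartesian product of both elements' hyperparameters, plus the four configurations in which the test_disabled PCA is switched off entirely. A sketch regenerating it with itertools, comparing via sorted(i.items()) so that values are checked as well as keys:

import itertools

pca_grid = [{'pca__n_components': n, 'pca__disabled': False} for n in [1, 2]]
pca_grid.append({'pca__disabled': True})  # test_disabled=True adds the "element off" case

svc_grid = [{'svc__C': c, 'svc__kernel': k}
            for c in [0.1, 1] for k in ['rbf', 'sigmoid']]

config_grid = [{**p, **s} for p, s in itertools.product(pca_grid, svc_grid)]
print(len(config_grid))  # 3 * 4 = 12 configurations
print(sorted(config_grid[0].items()))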
Example #10
    def testCaseA(self):
        pca_n_components = 10
        svc_c = 1
        svc_kernel = "rbf"
        # SET UP HYPERPIPE
        my_pipe = Hyperpipe('primary_pipe', optimizer='grid_search', optimizer_params={},
                            metrics=['accuracy', 'precision', 'f1_score'],
                            inner_cv=KFold(n_splits=3),
                            outer_cv=KFold(n_splits=3),
                            eval_final_performance=True)

        my_pipe += PipelineElement.create('standard_scaler')
        my_pipe += PipelineElement.create('pca', {'n_components': [pca_n_components]})
        my_pipe += PipelineElement.create('svc', {'C': [svc_c], 'kernel': [svc_kernel]})

        # START HYPERPARAMETER SEARCH
        my_pipe.fit(self.__X, self.__y)
        print(my_pipe._test_performances)
        from Framework import LogExtractor
        log_ex = LogExtractor.LogExtractor(my_pipe.result_tree)
        log_ex.extract_csv("test_case_A.csv")

        # This still needs to be removed! ToDo
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA
        from sklearn.svm import SVC
        from sklearn.pipeline import Pipeline
        from sklearn.metrics import f1_score, accuracy_score, precision_score

        # Now we are using the native Scikit-learn methods
        sk_pipeline = Pipeline([("standard_scaler", StandardScaler()),
                                ("pca", PCA(n_components=pca_n_components)),
                                ("svc", SVC(C=svc_c, kernel=svc_kernel))])

        my_pipe._generate_outer_cv_indices()
        tmp_counter = 0
        for train_idx_arr, test_idx_arr in my_pipe.data_test_cases:

            sk_results = {'accuracy': [], 'precision': [], 'f1_score': [], 'default': []}

            outer_train_X = self.__X[train_idx_arr]
            outer_train_y = self.__y[train_idx_arr]
            outer_test_X = self.__X[test_idx_arr]
            outer_test_y = self.__y[test_idx_arr]

            sk_config_cv = KFold(n_splits=3)
            # Todo: test other configs and select best!
            for sub_train_idx, sub_test_idx in sk_config_cv.split(outer_train_X, outer_train_y):
                # sub-indices are relative to the outer training set
                inner_train_X = outer_train_X[sub_train_idx]
                inner_train_y = outer_train_y[sub_train_idx]
                #test_X = outer_train_X[sub_test_idx]
                #test_y = outer_train_y[sub_test_idx]

                # sk_pipeline.fit(inner_train_X, inner_train_y)

                fit_and_predict_score = _fit_and_score(sk_pipeline, outer_train_X, outer_train_y, self.score,
                                                       sub_train_idx, sub_test_idx, verbose=0, parameters={},
                                                       fit_params={},
                                                       return_train_score=True,
                                                       return_n_test_samples=True,
                                                       return_times=True, return_parameters=True,
                                                       error_score='raise')

            sk_pipeline.fit(outer_train_X, outer_train_y)
            sk_prediction = sk_pipeline.predict(outer_test_X)

            sk_results['default'].append(fit_and_predict_score[1])
            sk_results['accuracy'].append(accuracy_score(outer_test_y, sk_prediction))
            sk_results['precision'].append(precision_score(outer_test_y, sk_prediction))
            sk_results['f1_score'].append(f1_score(outer_test_y, sk_prediction))

            # bestItem = np.argmax(sk_results['default'])
            # print([str(k)+':'+str(i[bestItem]) for k, i in sk_results.items()])

            self.assertEqual(sk_results['accuracy'], my_pipe._test_performances['accuracy'][tmp_counter])
            self.assertEqual(sk_results['precision'], my_pipe._test_performances['precision'][tmp_counter])
            self.assertEqual(sk_results['f1_score'], my_pipe._test_performances['f1_score'][tmp_counter])

            tmp_counter += 1
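
_fit_and_score is a private sklearn helper whose signature has changed between releases; the public, version-stable equivalent of that inner scoring loop is cross_validate. A sketch with synthetic data:

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=150, n_features=20, random_state=0)

sk_pipeline = Pipeline([('standard_scaler', StandardScaler()),
                        ('pca', PCA(n_components=10)),
                        ('svc', SVC(C=1, kernel='rbf'))])

scores = cross_validate(sk_pipeline, X, y, cv=KFold(n_splits=3),
                        scoring=['accuracy', 'precision', 'f1'],
                        return_train_score=True)
print(scores['test_accuracy'])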
Example #11
"""
Test Feature Selection
"""

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold

from Framework.PhotonBase import Hyperpipe, PipelineElement

dataset = load_breast_cancer()
X = dataset.data
y = dataset.target

# create cross-validation object first
cv_object = KFold(n_splits=3, shuffle=True, random_state=0)

# create a Hyperpipe
manager = Hyperpipe('god', cv_object, optimizer='random_grid_search')

manager += PipelineElement.create('f_classif_select_percentile',
                                  {'percentile': [10, 20, 30, 100]},
                                  test_disabled=True)

# SVM with a linear kernel
manager += PipelineElement.create('svc', {}, kernel='linear')

manager.fit(X, y)
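
f_classif_select_percentile presumably wraps sklearn's univariate feature selection; for comparison, the bare sklearn counterpart of this pipeline would look roughly like this (a sketch, not the framework's implementation):

from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

dataset = load_breast_cancer()
X, y = dataset.data, dataset.target

pipe = Pipeline([('select', SelectPercentile(score_func=f_classif)),
                 ('svc', SVC(kernel='linear'))])

search = GridSearchCV(pipe, {'select__percentile': [10, 20, 30, 100]},
                      cv=KFold(n_splits=3, shuffle=True, random_state=0))
search.fit(X, y)
print(search.best_params_)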
Example #12
dataset_files = oasis_dataset.gray_matter_maps
targets = oasis_dataset.ext_vars['age'].astype(float)  # age

# # data
# from sklearn.datasets import load_breast_cancer
# dataset = load_breast_cancer()
# dataset_files = dataset.data
# targets = dataset.target

print(BrainAtlas._getAtlasDict())

# setup photonai HP
my_pipe = Hyperpipe('primary_pipe',
                    optimizer='grid_search',
                    optimizer_params={},
                    metrics=['mean_squared_error', 'mean_absolute_error'],
                    inner_cv=KFold(n_splits=2, shuffle=True, random_state=3),
                    outer_cv=KFold(n_splits=2, shuffle=True, random_state=3),
                    eval_final_performance=True)

my_pipe += PipelineElement.create('SmoothImgs',
                                  {'fwhr': [[8, 8, 8], [12, 12, 12]]})
my_pipe += PipelineElement.create('ResampleImgs', {'voxel_size': [[5, 5, 5]]})

atlas_info = AtlasInfo(atlas_name='mni_icbm152_t1_tal_nlin_sym_09a_mask',
                       mask_threshold=.5,
                       roi_names='all',
                       extraction_mode='vec')
#atlas_info = AtlasInfo(atlas_name='AAL', roi_names='all', extraction_mode='box')
my_pipe += PipelineElement.create('BrainAtlas', {},
                                  atlas_info_object=atlas_info)
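
SmoothImgs and ResampleImgs appear to wrap standard nilearn image operations. Assuming that, the underlying calls look roughly like this (sketch with a synthetic volume standing in for a gray-matter map; smoothing kernel given as FWHM in mm):

import nibabel as nib
import numpy as np
from nilearn.image import resample_img, smooth_img

# synthetic 3D volume standing in for one gray-matter map
img = nib.Nifti1Image(np.random.rand(20, 20, 20), affine=np.eye(4))

smoothed = smooth_img(img, fwhm=[8, 8, 8])  # Gaussian kernel FWHM in mm
resampled = resample_img(smoothed, target_affine=np.diag([5, 5, 5]))  # 5 mm voxels
print(resampled.shape)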
Example #13
#  -----------> calculate something ------------------- #

# LOAD DATA
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
print(np.sum(y) / len(y))  # proportion of positive targets (class balance)

from pymodm import connect
connect("mongodb://localhost:27017/photon_db")

# BUILD PIPELINE
manager = Hyperpipe('test_manager',
                    optimizer='timeboxed_random_grid_search', optimizer_params={'limit_in_minutes': 1},
                    outer_cv=ShuffleSplit(test_size=0.2, n_splits=3),
                    inner_cv=KFold(n_splits=10, shuffle=True), best_config_metric='accuracy',
                    metrics=['accuracy', 'precision', 'recall', "f1_score"],
                    logging=False, eval_final_performance=True,
                    calculate_metrics_across_folds=True,
                    verbose=2)

manager.add(PipelineElement.create('standard_scaler', test_disabled=True))
manager += PipelineElement.create('pca', hyperparameters={'n_components': [None, 1, 10000]})
# tmp_lasso = Lasso()
# manager.add(PipelineElement.create('SelectModelWrapper', estimator_obj=tmp_lasso))

svm = PipelineElement.create('svc', hyperparameters={'C': [0.5, 1], 'kernel': ['linear']})
manager.add(svm)
manager.fit(X, y)

#  -----------> Result Tree generated ------------------- #
result_tree = manager.result_tree