Example #1
# Imports and module-level context assumed by this snippet (not shown in the
# original listing):
import pickle
import unittest

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

from feat import Feat

verbosity = 0  # module-level verbosity flag used by the tests below


class TestFeatWrapper(unittest.TestCase):
    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target

    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X, self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X)

        self.debug(
            "Comparing the length of labels in transform vs the actual feature set"
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X, self.y)

        self.debug(
            "Comparing the length of labels in transform vs the actual feature set"
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    #Test 5:  Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X, zfile, zids)

        self.debug(
            "Comparing the length of labels in transform vs the actual feature set"
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    def debug(self, message):
        if self.v > 0:
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X, self.y)
        coefs = self.clf.get_coefs()
        self.assertTrue(len(coefs) > 0)

    def test_dataframe(self):
        self.debug("In wrappertest.py...Calling test_dataframe")
        dfX = pd.DataFrame(
            data=self.X,
            columns=['fishy' + str(i) for i in np.arange(self.X.shape[1])],
            index=None)
        dfy = pd.DataFrame(data={'label': self.y})

        self.clf.fit(dfX, dfy['label'])
        assert (self.clf.feature_names == ','.join(dfX.columns).encode())

    #Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        for key in self.clf.stats:
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)

    #Test ability to pickle feat model
    def test_pickling(self):
        self.debug("Pickle Feat object")

        with open('test_pickle.pkl', 'wb') as f:
            pickle.dump(self.clf, f)

        with open('test_pickle.pkl', 'rb') as f:
            loaded_clf = pickle.load(f)

        assert (loaded_clf.get_params() == self.clf.get_params())

    def test_archive(self):
        """test archiving ability"""
        self.debug("Test archive")

        self.clf.classification = True
        self.clf.ml = b'LR'
        # dtype=int: np.int was removed in recent NumPy versions
        self.clf.fit(self.X, np.array(self.y > np.median(self.y),
                                      dtype=int))
        archive = self.clf.get_archive()
        preds = self.clf.predict_archive(self.X)
        probs = self.clf.predict_proba_archive(self.X)

        for arch, pred, prob in zip(archive, preds, probs):
            self.assertTrue(arch['id'] == pred['id'])
            self.assertTrue(arch['id'] == prob['id'])

    def test_lr_l1(self):
        """testing l1 penalized LR"""
        self.clf.classification = True
        self.clf.ml = b'L1_LR'
        # dtype=int: np.int was removed in recent NumPy versions
        self.clf.fit(self.X, np.array(self.y > np.median(self.y),
                                      dtype=int))

        self.assertEqual(len(self.clf.predict(self.X)), len(self.y))
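
These tests use the standard unittest layout, so the file can be run directly. A minimal entry point, assuming the class above is saved as a module such as the wrappertest.py referenced in the debug messages:

if __name__ == '__main__':
    unittest.main()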
Example #2
    # Module-level context assumed by this snippet (not shown in the listing):
    # SEED, split_mode, offline, fix_seed, LOGGER, ProcessData, Feat, AllModel,
    # split_train_and_valid, split_train_and_test, get_preds, `import time`,
    # and `import pandas as pd`.
    def train_predict(self, data, time_budget, n_class, schema):
        s1 = time.time()
        seed = SEED
        fix_seed(seed)
        LOGGER.info(f'time_budget:{time_budget}')
        LOGGER.info(f'n_class:{n_class}')
        LOGGER.info(f'node:{data["fea_table"].shape[0]}')
        LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

        # Pre-process the data
        process_data = ProcessData(data)
        table = process_data.pre_process(time_budget, n_class, schema)

        # Feature Dimension Reduction
        feat = Feat()

        process_data.drop_unique_columns(table)
        drop_sum_columns = process_data.drop_excessive_columns(table)

        feat.fit_transform(table, drop_sum_columns)
        LOGGER.info(
            f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
        )

        # This does not seem to be used anywhere
        table.large_features = False
        if table.ori_columns.shape[0] > 500:
            table.large_features = True

        model_type_list = ['sage', 'gat', 'tagc', 'gcn']

        repeat = 3
        model_name_list = [
            f'{model_type_list[i]}{i+len(model_type_list)*j}'
            for j in range(repeat) for i in range(len(model_type_list))
        ]
        model_type_list = model_type_list * repeat

        LOGGER.info('use node embedding')
        categories = [
            'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
        ]

        for model in set(model_type_list):
            LOGGER.info(
                f"""{model} feature num:{eval(f'table.{model}_columns.shape[0]')}"""
            )
            exec(
                f'table.{model}_data = process_data.process_gnn_data(table,table.{model}_columns,categories)'
            )
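        # The eval/exec pair above reads table.<model>_columns and assigns
        # table.<model>_data from dynamically built code strings; getattr and
        # setattr (e.g. setattr(table, f'{model}_data', ...)) would achieve
        # the same effect without evaluating strings.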

        allmodel = AllModel()

        table.lr_epoch = 16

        table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

        train_valid_idx_list, valid_idx_list = split_train_and_valid(
            table, train_rate=0.8, seed=SEED, mode=split_mode)
        train_idx, test_idx = split_train_and_test(table)

        test_idx = test_idx.sort_values()
        run_model = []
        run_type = []
        run_time = {}
        for i in range(len(model_type_list)):
            seed = SEED * (i + 1)
            fix_seed(seed)
            model_type = model_type_list[i]
            model_name = model_name_list[i]
            if model_type not in run_time:
                init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                    table,
                    model_type,
                    model_name,
                    train_idx,
                    test_idx,
                    seed=seed)
                run_lr_time = len(table.lr_list) * (
                    init_time + table.lr_epoch * one_epoch_time)
                run_time500 = init_time * (2) + one_epoch_time * (
                    500 + early_stopping_rounds) * 2 + run_lr_time
                run_time300 = init_time * (2) + one_epoch_time * (
                    300 + early_stopping_rounds) * 2 + run_lr_time
                run_time150 = init_time * (2) + one_epoch_time * (
                    150 + early_stopping_rounds) * 2 + run_lr_time
                run_time[model_type] = (run_time500 - run_lr_time,
                                        run_time300 - run_lr_time,
                                        run_time150 - run_lr_time,
                                        early_stopping_rounds, init_time,
                                        one_epoch_time, run_lr_time)
            else:
                run_time500, run_time300, run_time150, early_stopping_rounds, init_time, one_epoch_time, run_lr_time = run_time[
                    model_type]
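            # run_time500/300/150 estimate the wall-clock cost of a
            # 500/300/150-epoch run: two inits plus two (epochs +
            # early-stopping) training passes, plus the one-off
            # learning-rate search (run_lr_time).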
            s2 = time.time()
            LOGGER.info(
                f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s"
            )
            if s2 - s1 + run_time500 + 5 < time_budget:
                LOGGER.info('train 500 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=500,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time300 + 5 < time_budget:
                LOGGER.info('train 300 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=300,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time150 + 5 < time_budget:
                LOGGER.info('train 150 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=150,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif len(allmodel.valid_models[0]) == 0:
                this_epoch = int((
                    (time_budget -
                     (s2 - s1 + 5) - run_lr_time) / 2 - init_time) /
                                 (one_epoch_time) - early_stopping_rounds)
                LOGGER.info(f'short time train {this_epoch} epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=this_epoch,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif time_budget - (s2 - s1) < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

        if offline:
            if table.especial:
                df = table.df[['node_index', 'is_test']]
                df = df.merge(data['test_label'], how='left', on='node_index')
                test_label = df.loc[(df['is_test'] == 1) &
                                    (table.directed_mask.tolist()),
                                    'label'].astype('int').values
            else:
                test_label = data['test_label']['label'].values
        else:
            test_label = None

        preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds = (preds1 + preds2) / 2

        preds = preds.argmax(axis=1).flatten()

        if table.especial:
            LOGGER.info(f'preds\n{preds}')
            df = table.df[['label', 'is_test']]
            df['preds'] = int(
                df.loc[[not i for i in table.directed_mask.tolist()],
                       'label'].value_counts().index[0])
            df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                   'preds'] = preds
            preds = df.loc[df['is_test'] == 1, 'preds'].values

        LOGGER.info(
            f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
        )
        df_preds = pd.Series(preds, name='preds')
        LOGGER.info(
            f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

        if offline:
            preds1 = preds1.argmax(axis=1).flatten()
            preds2 = preds2.argmax(axis=1).flatten()
            if table.especial:
                LOGGER.info(f'preds1\n{preds1}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds1
                preds1 = df.loc[df['is_test'] == 1, 'preds'].values

                LOGGER.info(f'preds2\n{preds2}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds2
                preds2 = df.loc[df['is_test'] == 1, 'preds'].values

            df_test = table.df[['degree', 'label', 'is_test']]
            df_test = df_test.loc[df_test['is_test'] == 1]
            df_test['preds'] = preds
            df_test['label'] = data['test_label']['label'].values
            df_test['acc'] = df_test['preds'] == df_test['label']

            pd.set_option('display.max_rows', 1000)
            print(df_test.groupby('degree')['acc'].mean())

            return preds, valid_acc1, valid_acc2, preds1, preds2
        else:
            return preds
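
The scheduling logic above follows a simple budgeting pattern: estimate how long each candidate model will take, then train it at the largest epoch count that still fits the remaining time. A stripped-down sketch of the same pattern, with hypothetical estimate_cost and train callables standing in for AllModel.get_run_time and AllModel.V37_fit_transform:

import time

def run_within_budget(models, time_budget, estimate_cost, train):
    # estimate_cost(model, epochs) -> estimated seconds (hypothetical helper)
    # train(model, epochs)         -> runs the actual training
    start = time.time()
    trained = []
    for model in models:
        elapsed = time.time() - start
        for epochs in (500, 300, 150):  # same ladder as train_predict
            if elapsed + estimate_cost(model, epochs) + 5 < time_budget:
                train(model, epochs)  # keep a 5 s safety margin
                trained.append((model, epochs))
                break
        else:
            if time_budget - elapsed < 5:
                break  # budget exhausted: stop scheduling entirely
    return trained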
Example #3
class TestFeatWrapper(unittest.TestCase):

    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=self.v)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target
        
    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X,self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X,self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X,self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )
        
    #Test 5:  Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X, zfile, zids)

        self.debug("Comparing the length of labels in transform vs the actual feature set")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    def debug(self, message):
        if self.v > 0:
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X,self.y)
        coefs = self.clf.get_coefs()
        print('coefs:',coefs)
        self.assertTrue( len(coefs)>0 )
Example #4
class TestFeatWrapper(unittest.TestCase):

    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target
        
    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X,self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X,self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X,self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )
        
    #Test 5:  Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X, zfile, zids)

        self.debug("Comparing the length of labels in transform vs the actual feature set")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    def debug(self, message):
        if self.v > 0:
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X,self.y)
        coefs = self.clf.get_coefs()
        print('coefs:',coefs)
        self.assertTrue( len(coefs)>0 )

    def test_dataframe(self):
        self.debug("In wrappertest.py...Calling test_dataframe")
        dfX = pd.DataFrame(data=self.X,
                           columns=['fishy' + str(i)
                                    for i in np.arange(self.X.shape[1])],
                           index=None)
        dfy = pd.DataFrame(data={'label': self.y})

        self.clf.fit(dfX, dfy['label'])
        assert self.clf.feature_names == ','.join(dfX.columns).encode()

    #Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        print("Num generations is ", self.clf.gens)
        for key in self.clf.stats:
            print("Length for ", key, "is ", len(self.clf.stats[key]))
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)