Пример #1
0
def test_aggregation(load_titanic):
    """Check that ``aggregation`` returns a train frame with new columns.

    ``load_titanic`` is a fixture that unpacks into a ``(train, test)`` pair.
    """
    train, test = load_titanic

    # One groupby spec: key ``pclass``, variables ``age``/``fare``, and the
    # list of aggregate names passed through to ``aggregation``.
    stats = ['mean', 'sum', 'median', 'min', 'max', 'var', 'std']
    groupby_dict = [dict(key=['pclass'], var=['age', 'fare'], agg=stats)]
    # One nunique spec: key ``pclass``, variable ``sibsp``.
    nunique_dict = [dict(key=['pclass'], var=['sibsp'], agg=['nunique'])]

    train_new, _ = aggregation(train, test, groupby_dict, nunique_dict)

    # At least one column name must exist in the result but not in the input.
    added_columns = set(train_new.columns).difference(train.columns)
    assert len(added_columns) > 0
Пример #2
0
def test_aggregation(load_titanic):
    """Smoke test: the train frame from ``aggregation`` gains columns.

    The ``load_titanic`` fixture is unpacked into ``train`` and ``test``.
    """
    train, test = load_titanic

    groupby_spec = {
        "key": ["pclass"],
        "var": ["age", "fare"],
        "agg": ["mean", "sum", "median", "min", "max", "var", "std"],
    }
    nunique_spec = {"key": ["pclass"], "var": ["sibsp"], "agg": ["nunique"]}

    # Both arguments are single-element lists of spec dicts.
    train_new, _ = aggregation(train, test, [groupby_spec], [nunique_spec])

    # A non-empty set difference means at least one column name was added.
    extra_columns = set(train_new.columns) - set(train.columns)
    assert extra_columns
 # Rebind both frames to the pair returned by ``aggregation``.  The groupby
 # specs differ only in their key column and the nunique specs only in the
 # counted column, so both lists are generated instead of spelled out.
 X_train, X_test = aggregation(
     X_train,
     X_test,
     groupby_dict=[
         {
             'key': [group_key],
             'var': numerical_cols,
             'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std'],
         }
         for group_key in ('Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked')
     ],
     nunique_dict=[
         {'key': ['Sex'], 'var': [counted_col], 'agg': ['nunique']}
         for counted_col in ('SibSp', 'Cabin')
     ],
 )
Пример #4
0
    def create(self) -> None:
        """Run the configured preprocessing steps, prune columns and save.

        An optional step runs only when its name is a key of
        ``self.preprocessing``.  Steps run in the fixed order written below,
        and each one rebinds ``self.train`` / ``self.test`` to the pair
        returned by its helper.  After the optional steps, three stages
        always run:

        * ``+inf`` / ``-inf`` are replaced by ``9999999999`` /
          ``-9999999999`` in both frames;
        * the three column lists returned by ``detect_delete_cols`` are
          recorded in ``self.logger`` and, together with
          ``self.cols_definition['delete_col']``, passed to ``delete_cols``;
        * the shape of the train frame is printed and both frames are handed
          to ``save_as_pickle``.

        Raises:
            ValueError: if ``count_null`` is enabled and the target column is
                not among the train columns (raised by ``list.remove``).
            KeyError: if a required configuration key is missing.
        """
        prep = self.preprocessing
        cols = self.cols_definition

        if 'count_null' in prep:
            with timer('count_null'):
                # Every train column except the target is passed on.
                encode_col = list(self.train.columns)
                encode_col.remove(cols['target_col'])
                # Store the result on the instance like every other step;
                # binding it to throwaway locals discarded the step's output.
                self.train, self.test = count_null(
                    self.train, self.test, {'encode_col': encode_col})

        if 'label_encoding' in prep:
            with timer('label_encoding'):
                self.train, self.test = label_encoding(
                    self.train, self.test,
                    {'encode_col': cols['categorical_col']})

        if 'frequency_encoding' in prep:
            with timer('frequency_encoding'):
                self.train, self.test = frequency_encoding(
                    self.train, self.test,
                    {'encode_col': cols['categorical_col']})

        if 'count_encoding' in prep:
            with timer('count_encoding'):
                self.train, self.test = count_encoding(
                    self.train, self.test,
                    {'encode_col': cols['categorical_col']})

        if 'count_encoding_interact' in prep:
            with timer('count_encoding_interact'):
                self.train, self.test = count_encoding_interact(
                    self.train, self.test,
                    {'encode_col': cols['categorical_col']})

        if 'matrix_factorization' in prep:
            with timer('matrix_factorization'):
                # The columns come from the step's own config entry; the
                # component counts are fixed here.
                self.train, self.test = matrix_factorization(
                    self.train, self.test,
                    {'encode_col': prep['matrix_factorization']},
                    {
                        'n_components_lda': 5,
                        'n_components_svd': 3
                    })

        if 'target_encoding' in prep:
            with timer('target_encoding'):
                self.train, self.test = target_encoding(
                    self.train, self.test, {
                        'encode_col': prep['target_encoding'],
                        'target_col': cols['target_col']
                    }, {'cv': self.cv})

        if 'aggregation' in prep:
            with timer('aggregation'):
                self.train, self.test = aggregation(
                    self.train, self.test, {
                        'groupby_dict': prep['aggregation']['groupby_dict'],
                        'nunique_dict': prep['aggregation']['nunique_dict']
                    })

        if 'numeric_interact' in prep:
            with timer('numeric_interact'):
                self.train, self.test = numeric_interact(
                    self.train, self.test,
                    {'encode_col': cols['numerical_col']})

        if 'standerize' in prep:
            with timer('standerize'):
                self.train, self.test = standerize(
                    self.train, self.test,
                    {'encode_col': cols['numerical_col']})

        # The text steps are applied once per configured text column, each
        # with the step's own config entry as the last argument.
        if 'get_tfidf' in prep:
            with timer('get_tfidf'):
                for tc in cols['text_col']:
                    self.train, self.test = get_tfidf(
                        self.train, self.test, {'text_col': tc},
                        prep['get_tfidf'])

        if 'get_count' in prep:
            with timer('get_count'):
                for tc in cols['text_col']:
                    self.train, self.test = get_count(
                        self.train, self.test, {'text_col': tc},
                        prep['get_count'])

        if 'get_swem_mean' in prep:
            with timer('get_swem_mean'):
                for tc in cols['text_col']:
                    self.train, self.test = get_swem_mean(
                        self.train, self.test, {'text_col': tc},
                        prep['get_swem_mean'])

        if 'get_bert' in prep:
            with timer('get_bert'):
                for tc in cols['text_col']:
                    self.train, self.test = get_bert(
                        self.train, self.test, {'text_col': tc},
                        prep['get_bert'])

        with timer('replace inf'):
            # Swap infinities for large finite sentinels in both frames.
            self.train = self.train.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)
            self.test = self.test.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)

        with timer('delete cols'):
            unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
                self.train, self.test,
                {'escape_col': cols['categorical_col']},
                {'threshold': 0.995})
            # Record what was detected before the columns are removed.
            self.logger['unique_cols'] = unique_cols
            self.logger['duplicated_cols'] = duplicated_cols
            self.logger['high_corr_cols'] = high_corr_cols
            self.train, self.test = delete_cols(
                self.train, self.test, {
                    'encode_col':
                    unique_cols + duplicated_cols + high_corr_cols +
                    cols['delete_col']
                })

        with timer('save'):
            print('train.shape: ', self.train.shape)
            save_as_pickle(self.train, self.test,
                           {'target_col': cols['target_col']},
                           {
                               'exp_id': self.run_name,
                               'output_dir': self.output_dir
                           })
Пример #5
0
    def create(self) -> None:
        """Run the configured preprocessing steps, prune columns and save.

        An optional step runs only when its name is a key of
        ``self.preprocessing``.  Steps run in the fixed order written below
        and, apart from ``get_text_len`` (which adds its columns to the
        frames directly), each one rebinds ``self.train`` / ``self.test`` to
        the pair returned by its helper.  After the optional steps, three
        stages always run:

        * ``+inf`` / ``-inf`` are replaced by ``9999999999`` /
          ``-9999999999`` in both frames;
        * the three column lists returned by ``detect_delete_cols`` are
          recorded in ``self.logger`` and, together with
          ``self.cols_definition['delete_col']``, passed to ``delete_cols``;
        * the shape of the train frame is printed and both frames are handed
          to ``save_as_pickle``.

        Raises:
            ValueError: if ``count_null`` is enabled and the target column is
                not among the train columns (raised by ``list.remove``).
            KeyError: if a required configuration key is missing.
        """
        prep = self.preprocessing
        cols = self.cols_definition

        if 'count_null' in prep:
            with timer('count_null'):
                # Every train column except the target is passed on.
                encode_col = list(self.train.columns)
                encode_col.remove(cols['target_col'])
                # Store the result on the instance like every other step;
                # binding it to throwaway locals discarded the step's output.
                self.train, self.test = count_null(
                    self.train, self.test, encode_col)

        if 'label_encoding' in prep:
            with timer('label_encoding'):
                self.train, self.test = label_encoding(
                    self.train, self.test, cols['categorical_col'])

        if 'frequency_encoding' in prep:
            with timer('frequency_encoding'):
                self.train, self.test = frequency_encoding(
                    self.train, self.test, cols['categorical_col'])

        if 'count_encoding' in prep:
            with timer('count_encoding'):
                self.train, self.test = count_encoding(
                    self.train, self.test, cols['categorical_col'])

        if 'count_encoding_interact' in prep:
            with timer('count_encoding_interact'):
                self.train, self.test = count_encoding_interact(
                    self.train, self.test, cols['categorical_col'])

        if 'matrix_factorization' in prep:
            with timer('matrix_factorization'):
                # The columns come from the step's own config entry; the
                # component counts are fixed here.
                self.train, self.test = matrix_factorization(
                    self.train,
                    self.test,
                    prep['matrix_factorization'],
                    n_components_lda=5,
                    n_components_svd=3)

        if 'target_encoding' in prep:
            with timer('target_encoding'):
                self.train, self.test = target_encoding(
                    self.train,
                    self.test,
                    prep['target_encoding'],
                    target_col=cols['target_col'],
                    cv=self.cv)

        if 'numeric_interact' in prep:
            with timer('numeric_interact'):
                self.train, self.test = numeric_interact(
                    self.train, self.test, cols['numerical_col'])

        if 'aggregation' in prep:
            with timer('aggregation'):
                agg_conf = prep['aggregation']
                self.train, self.test = aggregation(
                    self.train,
                    self.test,
                    groupby_dict=agg_conf['groupby_dict'],
                    nunique_dict=agg_conf['nunique_dict'])

        if 'standerize' in prep:
            with timer('standerize'):
                self.train, self.test = standerize(
                    self.train, self.test, cols['numerical_col'])

        # The text steps are applied once per configured text column; each
        # reads ``n_components`` and ``lang`` from its own config entry.
        if 'get_tfidf' in prep:
            with timer('get_tfidf'):
                conf = prep['get_tfidf']
                for tc in cols['text_col']:
                    self.train, self.test = get_tfidf(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=conf['n_components'],
                        lang=conf['lang'])

        if 'get_count' in prep:
            with timer('get_count'):
                conf = prep['get_count']
                for tc in cols['text_col']:
                    self.train, self.test = get_count(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=conf['n_components'],
                        lang=conf['lang'])

        if 'get_swem_mean' in prep:
            with timer('get_swem_mean'):
                conf = prep['get_swem_mean']
                for tc in cols['text_col']:
                    self.train, self.test = get_swem_mean(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=conf['n_components'],
                        lang=conf['lang'])

        if 'get_bert' in prep:
            with timer('get_bert'):
                conf = prep['get_bert']
                for tc in cols['text_col']:
                    self.train, self.test = get_bert(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=conf['n_components'],
                        lang=conf['lang'])

        if 'get_text_len' in prep:
            with timer('get_text_len'):
                for tc in cols['text_col']:
                    # ``len`` is applied to every value, so each entry of the
                    # column must support it.
                    self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                    self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]

        with timer('replace inf'):
            # Swap infinities for large finite sentinels in both frames.
            self.train = self.train.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)
            self.test = self.test.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)

        with timer('delete cols'):
            unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
                self.train,
                self.test,
                escape_col=cols['categorical_col'],
                threshold=0.995)
            # Record what was detected before the columns are removed.
            self.logger['unique_cols'] = unique_cols
            self.logger['duplicated_cols'] = duplicated_cols
            self.logger['high_corr_cols'] = high_corr_cols
            self.train, self.test = delete_cols(
                self.train,
                self.test,
                encode_col=unique_cols + duplicated_cols + high_corr_cols +
                cols['delete_col'])

        with timer('save'):
            print('train.shape: ', self.train.shape)
            save_as_pickle(self.train,
                           self.test,
                           target_col=cols['target_col'],
                           exp_id=self.run_name,
                           output_dir=self.output_dir)