예제 #1
0
    def get_processing(self):
        """Run the final processing pipeline and return the feature table.

        Steps, in order:

        1. Load the data with ``self._get_data_plusaugmentation()``.
        2. Pass every column listed in ``abnormal_columns`` through
           ``self._get_abnormal_label`` into a new ``<column>_encoder``
           column, then one-hot encode those new columns with
           ``pd.get_dummies``.
        3. Add the business-logic features produced by
           ``self._get_operation_features``.
        4. If ``self.selector`` is truthy, run ``Selector`` on the rows whose
           ``score`` is greater than 0 and keep only the selected feature
           columns plus ``id`` and ``score``.

        Returns:
            The dataset returned by ``self._get_operation_features``,
            restricted to the selected columns when ``self.selector`` is
            truthy.
        """
        dataset = self._get_data_plusaugmentation()

        # NOTE(review): the result is no longer consumed here because the
        # boolean recombination step (see notes below) is disabled. The call
        # is retained only in case the helper has side effects on ``dataset``
        # -- confirm, and drop it if the helper is pure.
        self._get_boolean_columns(dataset)

        # Abnormal columns: bin them manually. Per the original author's
        # note, ``_get_abnormal_label`` only maps a value to a bin label
        # (1/2/3/4); ``get_dummies`` then turns those labels into indicator
        # columns.
        abnormal_columns = [
            'online_shopping_num', 'logistics_num', 'financing_num',
            'video_num', 'airplant_num', 'train_num', 'travel_num'
        ]
        abnormal_encoder_columns = []
        for column in abnormal_columns:
            encoder_column = f'{column}_encoder'
            dataset[encoder_column] = dataset[column].apply(
                self._get_abnormal_label)
            abnormal_encoder_columns.append(encoder_column)
        dataset = pd.get_dummies(dataset, columns=abnormal_encoder_columns)

        # Disabled experiments, kept as notes from the original author:
        # - Separate "<column>_missing" indicator features built with
        #   ``self._get_missing_value`` for the remaining numeric columns
        #   (treating their 0 values as missing data): marked as ineffective.
        # - ``self._recombine_boolean_columns`` (interaction features between
        #   the boolean columns): generated too many features.
        # - ``self._data_encoder(dataset, ['surfing_time', 'age'])``: purpose
        #   was unclear to the author.

        # Business-logic features.
        dataset = self._get_operation_features(dataset)

        if self.selector:
            # Feature selection is fitted only on rows with score > 0
            # (presumably the labelled rows -- TODO confirm).
            train_data = dataset[dataset['score'] > 0]
            y_data = train_data['score']
            x_data = train_data.drop(columns=['id', 'score'])
            select_features = Selector(
                ascending=self.ascending).get_select_features(x_data, y_data)
            # Build a new list rather than extending in place, so the object
            # returned by the Selector is never mutated and any iterable of
            # column names is accepted.
            keep_columns = [*select_features, 'id', 'score']
            dataset = dataset[keep_columns]

        return dataset
예제 #2
0
    def get_processing(self):
        """Run the processing pipeline and return the feature table.

        Steps, in order:

        1. Load the data with ``self._get_data()``.
        2. Pass every column listed in ``abnormal_columns`` through
           ``self._get_abnormal_label`` into a new ``<column>_encoder``
           column, then one-hot encode those new columns with
           ``pd.get_dummies``.
        3. Recombine the boolean columns reported by
           ``self._get_boolean_columns`` via
           ``self._recombine_boolean_columns``.
        4. Add the business-logic features produced by
           ``self._get_operation_features``.
        5. If ``self.selector`` is truthy, run ``Selector`` on the rows whose
           ``score`` is greater than 0 and keep only the selected feature
           columns plus ``id`` and ``score``.

        Returns:
            The dataset returned by ``self._get_operation_features``,
            restricted to the selected columns when ``self.selector`` is
            truthy.
        """
        dataset = self._get_data()

        # Determined up front, before any derived columns are added.
        boolean_columns = self._get_boolean_columns(dataset)

        # Abnormal columns: bin them manually, then one-hot encode the bins.
        abnormal_columns = [
            'online_shopping_num', 'logistics_num', 'financing_num',
            'video_num', 'airplant_num', 'train_num', 'travel_num'
        ]
        abnormal_encoder_columns = []
        for column in abnormal_columns:
            encoder_column = f'{column}_encoder'
            dataset[encoder_column] = dataset[column].apply(
                self._get_abnormal_label)
            abnormal_encoder_columns.append(encoder_column)
        dataset = pd.get_dummies(dataset, columns=abnormal_encoder_columns)

        # Disabled experiments, kept as notes from the original author:
        # - Separate "<column>_missing" indicator features built with
        #   ``self._get_missing_value`` for the remaining numeric columns:
        #   marked as ineffective.
        # - Embedding via
        #   ``self._data_encoder(dataset, ['surfing_time', 'age'])``.

        # Recombine the boolean columns.
        dataset = self._recombine_boolean_columns(dataset, boolean_columns)

        # Business-logic features.
        dataset = self._get_operation_features(dataset)

        if self.selector:
            # Feature selection is fitted only on rows with score > 0
            # (presumably the labelled rows -- TODO confirm).
            train_data = dataset[dataset['score'] > 0]
            y_data = train_data['score']
            x_data = train_data.drop(columns=['id', 'score'])
            select_features = Selector(
                ascending=self.ascending).get_select_features(x_data, y_data)
            # Build a new list rather than extending in place, so the object
            # returned by the Selector is never mutated and any iterable of
            # column names is accepted.
            keep_columns = [*select_features, 'id', 'score']
            dataset = dataset[keep_columns]

        return dataset