def get_processing(self):
    """Build the final feature table from the augmented data set.

    Pipeline:
      1. Load the data with ``self._get_data_plusaugmentation()``.
      2. Manually bin each app-usage count column with
         ``self._get_abnormal_label`` into a ``<column>_encoder`` column and
         one-hot encode those encoder columns with ``pd.get_dummies``.
      3. Add the business-logic features from ``self._get_operation_features``.
      4. When ``self.selector`` is truthy, keep only the features chosen by
         ``Selector`` plus ``id`` and ``score``.

    Assumes the loaded data is a pandas DataFrame that contains ``id``,
    ``score`` and the seven usage-count columns listed below -- TODO confirm
    against ``_get_data_plusaugmentation``.

    Returns:
        The processed dataset, as returned by the last step that ran.
    """
    dataset = self._get_data_plusaugmentation()

    # NOTE(review): the result is unused in this variant because boolean
    # recombination is disabled (it produced too many features). The call is
    # kept only in case the helper touches `dataset`; confirm it is a pure
    # query and then drop it.
    self._get_boolean_columns(dataset)

    # Abnormal (heavy-tailed) usage counters are binned by hand.
    abnormal_columns = [
        'online_shopping_num', 'logistics_num', 'financing_num', 'video_num',
        'airplant_num', 'train_num', 'travel_num'
    ]
    encoder_columns = []
    for column in abnormal_columns:
        encoder_column = f'{column}_encoder'
        dataset[encoder_column] = dataset[column].apply(
            self._get_abnormal_label)
        encoder_columns.append(encoder_column)

    # _get_abnormal_label only yields a bin label per value; get_dummies turns
    # each label column into indicator (dummy) columns.
    dataset = pd.get_dummies(dataset, columns=encoder_columns)

    # Steps that were tried and are intentionally not run here:
    #   * separate "<column>_missing" indicator features: found ineffective.
    #     Open question from the author: zeros outside the boolean and
    #     usage-count columns are presumably missing values, and still need a
    #     treatment.
    #   * self._data_encoder on 'surfing_time' / 'age': purpose unclear to the
    #     author, left disabled.

    # Business-logic features.
    dataset = self._get_operation_features(dataset)

    if self.selector:
        # Only rows with a positive score are used to fit the selector
        # (presumably the labelled training rows -- verify against the data).
        train_data = dataset[dataset['score'] > 0]
        y_data = train_data['score']
        x_data = train_data.drop(columns=['id', 'score'])
        select_features = Selector(
            ascending=self.ascending).get_select_features(x_data, y_data)
        # Build a new list rather than extending the selector's return value
        # in place.
        dataset = dataset[[*select_features, 'id', 'score']]

    return dataset
def get_processing(self):
    """Build the final feature table from the raw data set.

    Pipeline:
      1. Load the data with ``self._get_data()``.
      2. Manually bin each app-usage count column with
         ``self._get_abnormal_label`` into a ``<column>_encoder`` column and
         one-hot encode those encoder columns with ``pd.get_dummies``.
      3. Recombine the boolean columns reported by
         ``self._get_boolean_columns`` via ``self._recombine_boolean_columns``.
      4. Add the business-logic features from ``self._get_operation_features``.
      5. When ``self.selector`` is truthy, keep only the features chosen by
         ``Selector`` plus ``id`` and ``score``.

    Assumes the loaded data is a pandas DataFrame that contains ``id``,
    ``score`` and the seven usage-count columns listed below -- TODO confirm
    against ``_get_data``.

    Returns:
        The processed dataset, as returned by the last step that ran.
    """
    dataset = self._get_data()
    # Captured before any encoder columns are added to the frame.
    boolean_columns = self._get_boolean_columns(dataset)

    # Abnormal (heavy-tailed) usage counters are binned by hand.
    abnormal_columns = [
        'online_shopping_num', 'logistics_num', 'financing_num', 'video_num',
        'airplant_num', 'train_num', 'travel_num'
    ]
    encoder_columns = []
    for column in abnormal_columns:
        encoder_column = f'{column}_encoder'
        dataset[encoder_column] = dataset[column].apply(
            self._get_abnormal_label)
        encoder_columns.append(encoder_column)
    dataset = pd.get_dummies(dataset, columns=encoder_columns)

    # Steps that were tried and are intentionally not run here:
    #   * separate "<column>_missing" indicator features: found ineffective.
    #   * embedding 'surfing_time' / 'age' through self._data_encoder.

    # Recombine the boolean columns into combined features.
    dataset = self._recombine_boolean_columns(dataset, boolean_columns)

    # Business-logic features.
    dataset = self._get_operation_features(dataset)

    if self.selector:
        # Only rows with a positive score are used to fit the selector
        # (presumably the labelled training rows -- verify against the data).
        train_data = dataset[dataset['score'] > 0]
        y_data = train_data['score']
        x_data = train_data.drop(columns=['id', 'score'])
        select_features = Selector(
            ascending=self.ascending).get_select_features(x_data, y_data)
        # Build a new list rather than extending the selector's return value
        # in place.
        dataset = dataset[[*select_features, 'id', 'score']]

    return dataset