def howFill(self, df, colums):
    """Apply the configured null-handling strategy for one column.

    The column name is checked against the configuration entries read from
    ``cf`` in a fixed order; the first entry that matches decides which
    ``FillMethods`` routine (or ``self.dateDayNull``) is applied.

    Args:
        df: The data frame to process.
        colums: Name of the column whose missing values should be handled.

    Returns:
        The data frame returned by the selected routine, or ``df`` unchanged
        when no strategy is configured for ``colums``.
    """
    if colums in cf.get('null_processing_delete'):
        return FillMethods.delete(df, colums)
    if colums in cf.get('null_processing_assignMean'):
        return FillMethods.assignMean(df, colums)
    if colums == cf.get('null_processing_property_fee'):
        return FillMethods.property_fee(df, colums)
    # NOTE: columns configured under 'null_processing_assingNumber' (and
    # 'null_processing_assingNumber_complete_time') are deliberately not
    # dispatched here. FillMethods.assingNumber does both the fill and the
    # value uniformity step, so it has to be invoked on its own.
    if colums in cf.get('null_processing_assignZero'):
        return FillMethods.assignZero(df, colums)
    if colums in cf.get('null_processing_assignMode'):
        return FillMethods.assignMode(df, colums)
    if colums == 'crawl_time':
        return self.dateDayNull(df)
    # No strategy configured for this column: leave the frame untouched.
    return df
def assingNumber(df, null_processing_assingNumber):
    """Zero-fill one column and, for selected columns, map it to numbers.

    Steps:
      1. Null cells of the column are filled with zero.
      2. If the column contains the literal string ``'NULL'``, it is passed
         through ``UDFMethods.udf_NULL_assignZero`` (declared as FloatType).
      3. If the column is listed in the ``'null_processing_assingNumber'``
         configuration entry and is named ``'floor'`` or ``'decoration'``,
         it is passed through ``UDFMethods.udf_floor`` or
         ``UDFMethods.udf_decoration`` respectively (declared as FloatType).

    Whenever a UDF is applied, the converted column replaces the original one
    and ends up as the last column of the frame.

    Args:
        df: Spark DataFrame to process.
        null_processing_assingNumber: Name of the column to process.

    Returns:
        The processed DataFrame.
    """
    column = null_processing_assingNumber

    def _replace_with_udf(frame, func):
        # Compute the converted values next to the original column, then swap
        # them in under the original column name.
        converter = udf(func, FloatType())
        frame = frame.select('*', converter(frame[column]).alias('temp_name'))
        frame = frame.drop(column)
        return frame.withColumnRenamed('temp_name', column)

    # Only run the 'NULL' conversion when such literals actually exist, so a
    # column without them keeps its original type.
    has_null_literal = df.filter(df[column] == 'NULL').count() > 0

    # Fill real nulls. DataFrame.na.fill silently ignores columns whose type
    # does not match the fill value, so a numeric 0 never fills a string
    # column (it neither fills nor raises). Pick a fill value that matches the
    # column type. The fill is a no-op when there are no nulls, so no separate
    # null count is needed beforehand.
    fill_value = '0' if dict(df.dtypes).get(column) == 'string' else 0
    df = df.na.fill(fill_value, [column])

    if has_null_literal:
        df = _replace_with_udf(df, UDFMethods.udf_NULL_assignZero)

    # Value conversion for the columns that have a dedicated mapping UDF.
    if column in cf.get('null_processing_assingNumber'):
        if column == 'floor':
            df = _replace_with_udf(df, UDFMethods.udf_floor)
        elif column == 'decoration':
            df = _replace_with_udf(df, UDFMethods.udf_decoration)
    return df
def udf_room_type(s):
    """Keep a room-type value only if it is one of the configured values.

    Args:
        s: The raw room-type value.

    Returns:
        ``s`` itself when it is contained in the ``'uniformity_room_type'``
        configuration entry, otherwise ``'其他'``. ``'其他'`` is also returned
        when the membership test itself raises.
    """
    allowed_room_types = cf.get('uniformity_room_type')
    fallback = '其他'
    try:
        return s if s in allowed_room_types else fallback
    except Exception:
        # Best effort: a value that cannot be tested counts as "other".
        return fallback
def udf_direction(s):
    """Map a raw direction value to its configured uniform value.

    Args:
        s: The raw direction value.

    Returns:
        The value stored under ``s`` in the ``'uniformity_direction'``
        configuration mapping, or ``'其他'`` when ``s`` is not a key of that
        mapping or the lookup raises (for example for an unhashable ``s``).
    """
    direction_map = cf.get('uniformity_direction')
    try:
        # Single hash lookup instead of scanning a list of the keys and then
        # looking the key up a second time.
        return direction_map.get(s, '其他')
    except Exception:
        # Best effort: anything that cannot be looked up counts as "other".
        return '其他'
def udf_payType(s):
    """Map a raw pay-type value to its configured uniform value.

    Args:
        s: The raw pay-type value.

    Returns:
        The value stored under ``s`` in the ``'uniformity_pay_type'``
        configuration mapping, or ``'其他'`` when ``s`` is not a key of that
        mapping or the lookup raises (for example for an unhashable ``s``).

    Raises:
        AttributeError: If the configuration entry is missing or is not a
            mapping.
    """
    pay_type_map = cf.get('uniformity_pay_type')
    # Resolve the lookup outside the try block so that a missing or invalid
    # configuration entry still fails loudly instead of being masked.
    lookup = pay_type_map.get
    try:
        # Single hash lookup instead of scanning a list of the keys and then
        # looking the key up a second time.
        return lookup(s, '其他')
    except Exception:
        # Best effort: anything that cannot be looked up counts as "other".
        return '其他'
def udf_floor_total(s):
    """Normalise a total-floor-count value to a float.

    Args:
        s: The raw value; ``None`` and the literal string ``'NULL'`` are
            treated as missing.

    Returns:
        * ``0.0`` for a missing value or when any step raises (for example a
          value that cannot be converted to float).
        * ``float(floor_total_mode)`` when the value is negative or greater
          than the ``'uniformity_floor_total_max'`` configuration entry.
        * The value as a float otherwise.
    """
    try:
        # The previous check `(s != None) | (s != 'NULL')` was always true, so
        # missing values only produced 0.0 by way of a conversion error.
        if s is None or s == 'NULL':
            return 0.0
        value = float(s)
        if value < 0 or value > cf.get('uniformity_floor_total_max'):
            # NOTE(review): floor_total_mode is not defined in this function;
            # it has to exist at module scope, otherwise the NameError is
            # swallowed below and 0.0 is returned - confirm.
            return float(floor_total_mode)
        return value
    except Exception:
        # Best effort: unusable input is mapped to 0.0 rather than failing.
        return 0.0
def trainDataSplit(df):
    """Split a frame into train/test features and the ``'price'`` target.

    Every column except ``'price'`` is used as a feature. The number of
    training rows is the row count multiplied by the ``'train_size_rate'``
    configuration entry, truncated to an int; the split uses
    ``random_state=4``.

    Args:
        df: Frame containing a ``'price'`` column (presumably a pandas
            DataFrame, given the ``columns.tolist()`` / ``shape`` usage).

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``df`` has no ``'price'`` column.
    """
    feature_names = df.columns.tolist()
    feature_names.remove('price')
    features, target = df[feature_names], df['price']

    n_train_rows = int(features.shape[0] * cf.get('train_size_rate'))
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=n_train_rows, random_state=4)
    return X_train, X_test, y_train, y_test
def dataUniform(df):
    """Run the per-column uniformity step for every configured column.

    For each name in the ``'uniformity_fields'`` configuration entry that is
    also a column of ``df``, the matching ``DataUniformity`` routine (or
    ``Math.XiGeMa(df, column, 3)`` for the numeric columns listed below) is
    applied. Null counts before and after each step, the row count, and the
    resulting column are printed for debugging.

    Args:
        df: Spark DataFrame to process.

    Returns:
        The processed DataFrame.
    """
    # Column name -> name of the DataUniformity routine that handles it.
    # 'room_type' is intentionally absent (its handler is disabled), so it
    # falls through to the "need not be processed" branch.
    handler_names = {
        'direction': 'direction',
        'floor_total': 'floorTotal',
        'is_broker': 'isBroker',
        'rent_type': 'rentType',
        'pay_type': 'payType',
        'agency_name': 'agencyName',
        'zone': 'zone',
    }
    # Columns handed to Math.XiGeMa with a factor of 3.
    xigema_columns = ('price', 'score', 'house_count', 'area')

    uniformity_fields = cf.get('uniformity_fields')
    cols = df.columns
    print('dataUniform==========df.count()')
    df.count()

    for column in uniformity_fields:
        if column not in cols:
            continue

        nulls_before = df.filter(df[column].isNull()).count()
        print(
            'df.filter(df[colums].isNull()).count()=======data_uniformity_before',
            column, nulls_before)
        print('column=========', column, df.count())

        if column in handler_names:
            # Resolved lazily so only the routine actually needed is touched.
            df = getattr(DataUniformity, handler_names[column])(df)
        elif column in xigema_columns:
            df = Math.XiGeMa(df, column, 3)
        else:
            print('the feature need not be processed')

        nulls_after = df.filter(df[column].isNull()).count()
        print(
            'df.filter(df[colums].isNull()).count()=======data_uniformity_after',
            column, nulls_after)
        df.select(column).show()
    return df
def oneHotAll(df):
    """One-hot encode the configured columns and join them back by ``id``.

    The columns to encode are the names in the ``'null_no_processing'``
    configuration entry that are also columns of ``df``. Each of them is
    passed to ``oneHot(df, "id", field)``; the results are inner-joined on
    ``'id'`` with each other and with the remaining columns of ``df``.

    Args:
        df: Spark DataFrame with an ``'id'`` column.

    Returns:
        The joined DataFrame, or ``df`` unchanged when none of the configured
        fields is present in ``df``.
    """
    columns = df.columns
    # Keep the configured order (de-duplicated) so that the resulting column
    # order is the same on every run; a set intersection has no stable order.
    configured_fields = dict.fromkeys(cf.get('null_no_processing'))
    one_hot_fields = [field for field in configured_fields if field in columns]

    if not one_hot_fields:
        # Nothing to encode: previously this raised IndexError on [0].
        return df

    encoded = oneHot(df, "id", one_hot_fields[0])
    for field in one_hot_fields[1:]:
        encoded = encoded.join(oneHot(df, "id", field), on='id', how='inner')

    # Everything that was not one-hot encoded is carried over as is.
    remaining_columns = [name for name in columns if name not in one_hot_fields]
    return df.select(remaining_columns).join(encoded, 'id', 'inner')