Пример #1
0
# The temporary user-info frame is no longer referenced; release it before
# the merges below.
del user_info_tmp
gc.collect()

# Attach the shop attributes, bluntly discard every row that contains a NaN,
# then join the per-record user information on the 'index' key.
train = train.merge(shop_info, how='left', on=['shop_id']).dropna()
train = train.merge(user_info, how='left', on=['index'])

# Parse the timestamp and keep the hour of day of the current record.
train['time_stamp'] = pd.to_datetime(train['time_stamp'])
train['current_hour'] = pd.DatetimeIndex(train['time_stamp']).hour
# train['current_week'] =  pd.DatetimeIndex(train.time_stamp).dayofweek

# Distance and bearing between the shop position (s_*) and the record position,
# computed by helpers defined elsewhere in the file.
train['distance'] = hafuman_km(
    train['s_longitude'],
    train['s_latitude'],
    train['longitude'],
    train['latitude'],
)
train['current_bearing_array'] = bearing_array(
    train['s_latitude'].values,
    train['s_longitude'].values,
    train['latitude'].values,
    train['longitude'].values,
)
# train['distance'] = np.log1p(train['distance'])

# The historical aggregates are not very meaningful on their own, so each one
# is turned into a difference from the current observation.
train['c_wifi_var'] = train['strength'].sub(train['c_sw_average'])
train['wifi_var'] = train['strength'].sub(train['s_avg_power'])

train['s_wifi_var'] = train['strength'].sub(train['w_avg_power'])
train['sb_wifi_var'] = train['strength'].sub(train['sb_history_avg_power'])

train['angle_var'] = train['history_bearing_array_median'].sub(
    train['current_bearing_array'])
train['c_sb_wifi_var'] = train['strength'].sub(train['c_sb_history_avg_power'])

train['s_sb_wifi_var_ratio'] = train['c_sb_wifi_var'] / (train['sb_wifi_var'] +
Пример #2
0
def _add_group_stat(target, source, keys, column, func, name):
    """Left-merge onto ``target`` the per-``keys`` aggregate ``func`` of ``source[column]``.

    The aggregate lands in a new column called ``name``.  The column is named
    through ``Series.reset_index(name=...)`` instead of dict-based renaming
    inside ``agg``, which was deprecated in pandas 0.20 and is rejected by
    pandas >= 1.0.
    """
    stat = source.groupby(keys)[column].agg(func).reset_index(name=name)
    return pd.merge(target, stat, on=keys, how='left')


def _add_distinct_count(target, source, keys, column, name):
    """Left-merge onto ``target`` the number of distinct ``source[column]`` values per ``keys`` group."""
    # dropna=False keeps a missing value counted as one entry, in line with the
    # set-based count this replaces for string-valued columns.
    stat = (source.groupby(keys)[column]
            .nunique(dropna=False)
            .reset_index(name=name))
    return pd.merge(target, stat, on=keys, how='left')


def get_one_feature(count_feature, data_dir='./data/'):
    """Add historical shop / wifi statistics to ``count_feature``.

    Reads ``ccf_first_round_user_shop_behavior.csv`` and
    ``ccf_first_round_shop_info.csv`` from ``data_dir``, derives per-group
    aggregates and left-merges each of them back onto ``count_feature``.

    Args:
        count_feature: DataFrame that provides at least the columns used
            below: ``index``, ``mall_id``, ``shop_id``, ``bssid``,
            ``strength``, ``connect`` and ``nature_order``.  It is not
            modified; every step works on a merged copy.
        data_dir: Directory holding the two CSV files.  Defaults to the
            previously hard-coded ``'./data/'``.

    Returns:
        A new DataFrame with the added feature columns, every NaN replaced by
        0 and ``nature_order`` renamed to ``history_nature_order``.
    """
    import os

    shop_keys = ['mall_id', 'shop_id']
    wifi_keys = ['mall_id', 'bssid']
    shop_wifi_keys = ['mall_id', 'shop_id', 'bssid']

    # reset_index() exposes the CSV row number as an 'index' column, which is
    # the key used to line the coordinates up with count_feature.
    user_position = pd.read_csv(
        os.path.join(data_dir, 'ccf_first_round_user_shop_behavior.csv')
    )[['longitude', 'latitude']].reset_index()
    shops = pd.read_csv(
        os.path.join(data_dir, 'ccf_first_round_shop_info.csv')
    )[['shop_id', 'price', 'longitude', 'latitude', 'category_id']]
    # Keep only the second '_'-separated token of the category id.
    shops['category_id'] = shops['category_id'].map(
        lambda x: str(x).split('_')[1])
    # Prefix the shop coordinates so they do not clash with the record's own.
    shops = shops.rename(columns={'latitude': 's_latitude',
                                  'longitude': 's_longitude'})

    history = pd.merge(count_feature, shops, on=['shop_id'], how='left')
    history = pd.merge(history, user_position, on=['index'], how='left')
    del user_position, shops
    gc.collect()

    # Historical distance and bearing between shop and record position
    # (hafuman_km / bearing_array are defined elsewhere in the file;
    # presumably a great-circle distance in km and a compass bearing).
    history['distance_history'] = hafuman_km(
        history['s_longitude'], history['s_latitude'],
        history['longitude'], history['latitude'])
    history['history_bearing_array'] = bearing_array(
        history.s_latitude.values, history.s_longitude.values,
        history.latitude.values, history.longitude.values)

    # 1. Median distance of a shop's historical records (the original author
    #    noted that the median worked best in testing).
    count_feature = _add_group_stat(count_feature, history, shop_keys,
                                    'distance_history', 'median',
                                    's_median_scale')
    # 1.2 Median historical bearing of a shop.
    count_feature = _add_group_stat(count_feature, history, shop_keys,
                                    'history_bearing_array', 'median',
                                    'history_bearing_array_median')

    # 2.1 Mean wifi strength seen around a shop.
    count_feature = _add_group_stat(count_feature, history, shop_keys,
                                    'strength', 'mean', 's_avg_power')
    # 2.1.1 Same, restricted to records where the wifi was connected.
    count_feature = _add_group_stat(
        count_feature, count_feature[count_feature['connect'] == 1],
        shop_keys, 'strength', 'mean', 'c_sw_average')
    # 2.2 Mean strength per (shop, wifi) pair.
    count_feature = _add_group_stat(count_feature, history, shop_wifi_keys,
                                    'strength', 'mean',
                                    'sb_history_avg_power')
    # 2.3 Same, restricted to connected records.
    count_feature = _add_group_stat(
        count_feature, history[history['connect'] == 1],
        shop_wifi_keys, 'strength', 'mean', 'c_sb_history_avg_power')

    # The wide joined frame is not needed past this point.
    del history
    gc.collect()

    # 2.4 Mean and standard deviation of each wifi's strength within a mall.
    count_feature = _add_group_stat(count_feature, count_feature, wifi_keys,
                                    'strength', 'mean', 'w_avg_power')
    count_feature = _add_group_stat(count_feature, count_feature, wifi_keys,
                                    'strength', 'std', 'w_std_power')

    print('make_features')
    # How often each wifi appeared at each nature_order value.
    count_feature = _add_group_stat(
        count_feature, count_feature,
        ['mall_id', 'bssid', 'nature_order'], 'strength', 'count',
        'rank_times')

    # 3.2 Number of records in which the wifi was connected.
    count_feature = _add_group_stat(
        count_feature, count_feature[count_feature['connect'] == 1],
        wifi_keys, 'strength', 'count', 'wifi_is_connected_times')
    # 3.3 Number of connected records per (wifi, shop) pair.
    count_feature = _add_group_stat(
        count_feature, count_feature[count_feature['connect'] == 1],
        shop_wifi_keys, 'strength', 'count', 'wifi_is_connected_shop_times')
    gc.collect()

    # The +1.0 keeps the denominator non-zero.
    count_feature['shop_wifi_connect_ratio'] = (
        count_feature['wifi_is_connected_shop_times']
        / (count_feature['wifi_is_connected_times'] + 1.0))

    # 3. Number of distinct shops a wifi was seen with.
    count_feature = _add_distinct_count(count_feature, count_feature,
                                        wifi_keys, 'shop_id',
                                        'wifi_cover_shop')

    # tf-idf style statistics for wifi
    # 3.4 Number of records per (shop, wifi) pair.
    count_feature = _add_group_stat(count_feature, count_feature,
                                    shop_wifi_keys, 'strength', 'count',
                                    'wifi_shop_count')
    # 3.5 Number of wifi records per shop.
    count_feature = _add_group_stat(count_feature, count_feature, shop_keys,
                                    'bssid', 'count', 'wifi_shop_length')
    count_feature['wifi_shop_ratio_tfidf'] = (
        count_feature['wifi_shop_count']
        / (count_feature['wifi_shop_length'] + 1.0))

    # 3.6 Number of records per wifi within a mall.
    count_feature = _add_group_stat(count_feature, count_feature, wifi_keys,
                                    'strength', 'count', 'mall_wifi_count')
    # 3.7 Number of distinct wifi seen around a shop.
    count_feature = _add_distinct_count(count_feature, count_feature,
                                        shop_keys, 'bssid',
                                        'shop_around_count')
    gc.collect()

    count_feature['shop_around_ration_tfidf'] = (
        count_feature['shop_around_count']
        / (count_feature['mall_wifi_count'] + 1))
    count_feature['tfid_features'] = (
        np.log1p(count_feature['shop_around_ration_tfidf'])
        * count_feature['wifi_shop_ratio_tfidf'])

    # Groups without a matching statistic were left as NaN by the left
    # merges; the function reports them as 0.
    count_feature = count_feature.fillna(0)
    return count_feature.rename(
        columns={'nature_order': 'history_nature_order'})
# Attach the shop attributes to the submission candidates, then drop the
# label column and the shop table, which are not used again in this section.
sub = pd.merge(sub, shop_info, on=['shop_id'], how='left')
del sub['label']
del shop_info
gc.collect()
# Bluntly drop every row that contains a NaN (about 10,000 rows according to
# the original author's note).
sub = sub.dropna()
sub = pd.merge(sub, sub_user_info, on=['row_id'], how='left')
# Preview the first rows. The original passed the bound method `sub.head`
# to print() without calling it, which prints the method's repr instead of
# the preview.
print(sub.head())
sub['time_stamp'] = pd.to_datetime(sub['time_stamp'])
sub['current_hour'] = pd.DatetimeIndex(sub.time_stamp).hour
# sub['current_week'] =  pd.DatetimeIndex(sub.time_stamp).dayofweek

# Distance and bearing between the shop position (s_*) and the record position,
# computed by helpers defined elsewhere in the file.
sub['distance'] = hafuman_km(sub['s_longitude'], sub['s_latitude'],
                             sub['longitude'], sub['latitude'])
sub['current_bearing_array'] = bearing_array(sub.s_latitude.values,
                                             sub.s_longitude.values,
                                             sub.latitude.values,
                                             sub.longitude.values)

#sub['distance'] = np.log1p(sub['distance'])
# Express each historical aggregate as a difference from the current value.
sub['c_wifi_var'] = sub['strength'] - sub['c_sw_average']
sub['wifi_var'] = sub['strength'] - sub['s_avg_power']
sub['s_wifi_var'] = sub['strength'] - sub['w_avg_power']
sub['sb_wifi_var'] = sub['strength'] - sub['sb_history_avg_power']
sub['c_sb_wifi_var'] = sub['strength'] - sub['c_sb_history_avg_power']

sub['angle_var'] = sub['history_bearing_array_median'] - sub[
    'current_bearing_array']

# The small constant keeps the denominator away from an exact zero.
sub['s_sb_wifi_var_ratio'] = sub['c_sb_wifi_var'] / (sub['sb_wifi_var'] +
                                                     0.0001)
def get_one_feature(count_feature):
    """Add user, shop and wifi statistics to ``count_feature``.

    Reads ``ccf_first_round_user_shop_behavior.csv`` and
    ``ccf_first_round_shop_info.csv`` from ``../data/``, derives per-group
    aggregates and left-merges them back onto ``count_feature``.  The code
    below reads these columns of ``count_feature``: ``index``, ``user_id``,
    ``shop_id``, ``mall_id``, ``bssid``, ``strength``, ``connect``,
    ``nature_order``, ``longitude``, ``latitude`` and ``time_stamp``.

    Returns the merged frame with every NaN replaced by 0 and
    ``nature_order`` renamed to ``history_nature_order``.

    NOTE(review): this version looks unfinished; see the inline review notes
    (unassigned ``drop``, merge on a ``'shop'`` column, probable column-name
    clash on ``longitude``/``latitude``).
    NOTE(review): dict-based renaming inside ``.agg({...})`` was deprecated in
    pandas 0.20 and is rejected by pandas >= 1.0, and the
    ``rename(columns={0: ...})`` calls rely on ``apply`` producing a column
    labelled ``0`` -- confirm the pandas version this is meant to run on.
    """
    # NOTE(review): `dir` shadows the builtin of the same name.
    dir = '../data/'
    # reset_index() exposes the CSV row number as an 'index' column, which is
    # the merge key used further down.
    user_behavier = pd.read_csv(dir +
                                'ccf_first_round_user_shop_behavior.csv')[[
                                    'longitude', 'latitude'
                                ]].reset_index()
    shop_info_tmp = pd.read_csv(dir + 'ccf_first_round_shop_info.csv')[[
        'shop_id', 'price', 'longitude', 'latitude', 'category_id'
    ]]
    # Add the category: keep the second '_'-separated token of category_id.
    shop_info_tmp['category_id'] = shop_info_tmp['category_id'].map(
        lambda x: str(x).split('_')[1])

    # Prefix the shop coordinates with 's_'.
    shop_info_tmp.rename(columns={
        'latitude': 's_latitude',
        'longitude': 's_longitude'
    },
                         inplace=True)

    count_feature_with_shop_price = pd.merge(count_feature,
                                             shop_info_tmp,
                                             on=['shop_id'],
                                             how='left')
    # count_feature holds the pre-processed candidate features; join the
    # record coordinates onto it.
    # NOTE(review): count_feature.longitude / .latitude are read further down,
    # so the input apparently already has those columns; this merge would then
    # suffix the overlapping names (_x / _y) and the plain 'longitude' /
    # 'latitude' lookups in the distance section would fail -- confirm.
    count_feature_with_shop_price_position = pd.merge(
        count_feature_with_shop_price, user_behavier, on=['index'], how='left')

    #|||||||||||||||||||||||||||||||||||||||||| feature engineering |||||||||||||||||||||||||||||||||||||||||||||

    # The behaviour CSV is read a second time here, this time with all columns.
    user_shop_behavior = pd.read_csv(
        dir + 'ccf_first_round_user_shop_behavior.csv').reset_index()
    user_merge_shop = pd.merge(user_shop_behavior,
                               shop_info_tmp,
                               on=['shop_id'],
                               how='left')

    ############################### user features #########################################
    # 1. Average shop price level per user.
    people_avg_price = user_merge_shop.groupby(
        ['user_id'], as_index=False).price.agg({'p_avg_price': np.mean})
    count_feature = pd.merge(count_feature,
                             people_avg_price,
                             on=['user_id'],
                             how='left')
    del people_avg_price
    gc.collect()

    ############################### user x shop features ###############################
    # 1. How often the user visited the shop.
    people_shop_favor = user_merge_shop.groupby(['user_id', 'shop_id'],
                                                as_index=False).shop_id.agg(
                                                    {'p_shop_favor': 'count'})
    #people_shop_favor.rename(columns={'shop_id':'people_shop_favor'},inplace=True)
    count_feature = pd.merge(count_feature,
                             people_shop_favor,
                             on=['user_id', 'shop_id'],
                             how='left')

    del people_shop_favor
    gc.collect()

    # 2. Median position of the user's records at the shop.
    people_shop_location_longitude = count_feature.groupby(
        ['user_id', 'shop_id'],
        as_index=False).longitude.agg({'p_shop_location_longitude': np.median})
    people_shop_location_latitude = count_feature.groupby(
        ['user_id', 'shop_id'],
        as_index=False).latitude.agg({'p_shop_location_latitude': np.median})
    count_feature = pd.merge(count_feature,
                             people_shop_location_longitude,
                             on=['user_id', 'shop_id'],
                             how='left')
    count_feature = pd.merge(count_feature,
                             people_shop_location_latitude,
                             on=['user_id', 'shop_id'],
                             how='left')
    del people_shop_location_longitude
    gc.collect()
    del people_shop_location_latitude
    gc.collect()
    # NOTE(review): the result of drop() is neither assigned nor applied in
    # place, so count_feature keeps these columns; the call can still raise
    # KeyError if one of the labels is absent.  Confirm what was intended.
    count_feature.drop(['longitude', 'latitude', 's_longitude', 's_latitude'],
                       axis=1)

    # 3. Number of purchase records per shop, i.e. the shop's popularity.
    shop_hot = user_merge_shop.groupby(
        ['shop_id'], as_index=False).user_id.agg({'s_hot': 'count'})
    count_feature = pd.merge(count_feature,
                             shop_hot,
                             on=['shop_id'],
                             how='left')

    # 4. User-to-shop distance and bearing, computed by helpers defined
    #    elsewhere in the file.
    count_feature_with_shop_price_position['distance_history'] = hafuman_km(
        count_feature_with_shop_price_position['s_longitude'],
        count_feature_with_shop_price_position['s_latitude'],
        count_feature_with_shop_price_position['longitude'],
        count_feature_with_shop_price_position['latitude'])

    count_feature_with_shop_price_position[
        'history_bearing_array'] = bearing_array(
            count_feature_with_shop_price_position.s_latitude.values,
            count_feature_with_shop_price_position.s_longitude.values,
            count_feature_with_shop_price_position.latitude.values,
            count_feature_with_shop_price_position.longitude.values)

    # NOTE(review): nothing in this function creates a 'shop' column, so this
    # selection raises KeyError unless count_feature already carries one; the
    # merge also has no `on`/`how`, i.e. an inner join on all shared column
    # names.  Confirm the intended columns and join type.
    count_feature = pd.merge(count_feature,
                             count_feature_with_shop_price_position[['shop']])

    del user_behavier
    gc.collect()
    del count_feature_with_shop_price
    gc.collect()

    ######################################### shop features #########################################
    print(count_feature_with_shop_price_position.columns)
    # Mean nature_order of each wifi at each shop, and the current record's
    # deviation from it.
    wifirank_mean = count_feature_with_shop_price_position.groupby(
        ['shop_id', 'bssid'],
        as_index=False).nature_order.agg({'r_mean': np.mean})
    count_feature = pd.merge(count_feature,
                             wifirank_mean,
                             on=['shop_id', 'bssid'],
                             how='left')
    count_feature['wifirank_diff'] = count_feature[
        'nature_order'] - count_feature['r_mean']
    #count_feature = count_feature.drop(['r_mean'],axis=1)
    del wifirank_mean
    gc.collect()

    print(count_feature.columns)
    # Number of connected-wifi records per shop.
    shop_conncectwifi_count = count_feature[
        count_feature['connect'] == 1].groupby(
            ['shop_id'],
            as_index=False).bssid.agg({'s_conncectwifi_count': 'count'})
    count_feature = pd.merge(count_feature,
                             shop_conncectwifi_count,
                             on=['shop_id'],
                             how='left')
    del shop_conncectwifi_count
    gc.collect()

    # 1. Median distance of a shop's historical records (the author notes the
    #    median worked best in testing).
    shop_scale = count_feature_with_shop_price_position.groupby(
        ['mall_id', 'shop_id'],
        as_index=False).distance_history.agg({'s_median_scale': np.median})
    count_feature = pd.merge(count_feature,
                             shop_scale,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del shop_scale
    gc.collect()
    # 1.2 Median historical bearing of a shop.  The author notes that the
    #     difference between the current and historical bearing works well,
    #     and that the current bearing is kept as a feature too.
    shop_degree = count_feature_with_shop_price_position.groupby(
        ['mall_id', 'shop_id'], as_index=False).history_bearing_array.agg(
            {'history_bearing_array_median': np.median})
    count_feature = pd.merge(count_feature,
                             shop_degree,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del shop_degree
    gc.collect()

    # Strength features: means
    # 2.1 Mean wifi strength seen around a shop (purpose: current strength
    #     minus the historical mean).
    shop_around_wifi_power = count_feature_with_shop_price_position.groupby(
        ['mall_id', 'shop_id'],
        as_index=False).strength.agg({'s_avg_power': np.mean})
    count_feature = pd.merge(count_feature,
                             shop_around_wifi_power,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del shop_around_wifi_power
    gc.collect()

    # 2.1.1 Mean wifi strength around a shop, connected records only.
    number_count_shop_wifi_strength_c = count_feature[
        count_feature['connect'] == 1].groupby(['mall_id', 'shop_id'],
                                               as_index=False).strength.agg(
                                                   {'c_sw_average': np.mean})
    count_feature = pd.merge(count_feature,
                             number_count_shop_wifi_strength_c,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del number_count_shop_wifi_strength_c
    gc.collect()

    # 2.2 Mean strength per (shop, wifi) pair.
    shop_bssid_around_wifi_power = count_feature_with_shop_price_position.groupby(
        ['mall_id', 'bssid', 'shop_id'],
        as_index=False).strength.agg({'sb_history_avg_power': np.mean})
    count_feature = pd.merge(count_feature,
                             shop_bssid_around_wifi_power,
                             on=['mall_id', 'shop_id', 'bssid'],
                             how='left')
    del shop_bssid_around_wifi_power
    gc.collect()

    # 2.3 Mean strength per (shop, wifi) pair, connected records only.
    shop_bssid_around_wifi_power_c = count_feature_with_shop_price_position[
        count_feature_with_shop_price_position['connect'] == 1].groupby(
            ['mall_id', 'bssid', 'shop_id'],
            as_index=False).strength.agg({'c_sb_history_avg_power': np.mean})
    count_feature = pd.merge(count_feature,
                             shop_bssid_around_wifi_power_c,
                             on=['mall_id', 'shop_id', 'bssid'],
                             how='left')
    del shop_bssid_around_wifi_power_c
    gc.collect()

    # The joined frame is not used past this point.
    del count_feature_with_shop_price_position
    gc.collect()

    # 2.4 Mean and standard deviation of each wifi's strength within a mall.
    wifi_power_feat = count_feature.groupby(['mall_id', 'bssid'],
                                            as_index=False).strength.agg({
                                                'w_avg_power':
                                                np.mean,
                                                'w_std_power':
                                                np.std
                                            })
    count_feature = pd.merge(count_feature,
                             wifi_power_feat,
                             on=['mall_id', 'bssid'],
                             how='left')
    del wifi_power_feat
    gc.collect()

    # 3.1 Number of records in which the wifi was connected.
    wifi_is_connected_times = count_feature[
        count_feature['connect'] == 1].groupby(
            ['mall_id', 'bssid'], as_index=False).strength.count()
    wifi_is_connected_times.rename(
        columns={'strength': 'wifi_is_connected_times'}, inplace=True)
    count_feature = pd.merge(count_feature,
                             wifi_is_connected_times,
                             on=['mall_id', 'bssid'],
                             how='left')
    del wifi_is_connected_times
    gc.collect()

    # 3.2 Number of connected records per (wifi, shop) pair.
    wifi_is_connected_shop_times = count_feature[
        count_feature['connect'] == 1].groupby(
            ['mall_id', 'bssid', 'shop_id'], as_index=False).strength.count()
    wifi_is_connected_shop_times.rename(
        columns={'strength': 'wifi_is_connected_shop_times'}, inplace=True)
    count_feature = pd.merge(count_feature,
                             wifi_is_connected_shop_times,
                             on=['mall_id', 'shop_id', 'bssid'],
                             how='left')
    del wifi_is_connected_shop_times
    gc.collect()

    # The +1.0 keeps the denominator non-zero.
    count_feature['shop_wifi_connect_ratio'] = count_feature[
        'wifi_is_connected_shop_times'] / (
            count_feature['wifi_is_connected_times'] + 1.0)

    # 3.3 Number of distinct shops a wifi was seen with.
    # NOTE(review): the rename below only takes effect if apply() yields a
    # column labelled 0 -- verify against the pandas version in use.
    wifi_cover_count = count_feature.groupby(
        ['mall_id', 'bssid'],
        as_index=False).shop_id.apply(lambda x: len(set(x))).reset_index()
    wifi_cover_count.rename(columns={0: 'wifi_cover_shop'}, inplace=True)
    count_feature = pd.merge(count_feature,
                             wifi_cover_count,
                             on=['mall_id', 'bssid'],
                             how='left')
    del wifi_cover_count
    gc.collect()

    # tf-idf style statistics for wifi

    # 3.4 Number of records per (shop, wifi) pair.
    wifi_shop_count = count_feature.groupby(['mall_id', 'shop_id', 'bssid'],
                                            as_index=False).strength.count()
    wifi_shop_count.rename(columns={'strength': 'wifi_shop_count'},
                           inplace=True)
    count_feature = pd.merge(count_feature,
                             wifi_shop_count,
                             on=['mall_id', 'shop_id', 'bssid'],
                             how='left')
    del wifi_shop_count
    gc.collect()

    # 3.5 Number of wifi records per shop.
    wifi_shop_length = count_feature.groupby(['mall_id', 'shop_id'],
                                             as_index=False).bssid.count()
    wifi_shop_length.rename(columns={'bssid': 'wifi_shop_length'},
                            inplace=True)
    count_feature = pd.merge(count_feature,
                             wifi_shop_length,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del wifi_shop_length
    gc.collect()

    count_feature['wifi_shop_ratio_tfidf'] = count_feature[
        'wifi_shop_count'] / (count_feature['wifi_shop_length'] + 1.0)

    # 3.6 Number of records per wifi within a mall.
    mall_wifi_count = count_feature.groupby(['mall_id', 'bssid'],
                                            as_index=False).strength.count()
    mall_wifi_count.rename(columns={'strength': 'mall_wifi_count'},
                           inplace=True)
    count_feature = pd.merge(count_feature,
                             mall_wifi_count,
                             on=['mall_id', 'bssid'],
                             how='left')
    del mall_wifi_count
    gc.collect()

    # 3.7 Number of distinct wifi seen around a shop.
    # NOTE(review): same reliance on a column labelled 0 as in 3.3.
    shop_around_count = count_feature.groupby(
        ['mall_id', 'shop_id'],
        as_index=False).bssid.apply(lambda x: len(set(x))).reset_index()
    shop_around_count.rename(columns={0: 'shop_around_count'}, inplace=True)
    count_feature = pd.merge(count_feature,
                             shop_around_count,
                             on=['mall_id', 'shop_id'],
                             how='left')
    del shop_around_count
    gc.collect()

    count_feature['shop_around_ration_tfidf'] = count_feature[
        'shop_around_count'] / (count_feature['mall_wifi_count'] + 1)

    count_feature['tfid_features'] = np.log1p(
        count_feature['shop_around_ration_tfidf']
    ) * count_feature['wifi_shop_ratio_tfidf']

    # Plain sum of the four count features above.
    count_feature[
        'sun_features'] = count_feature['shop_around_count'] + count_feature[
            'mall_wifi_count'] + count_feature[
                'wifi_shop_count'] + count_feature['wifi_shop_length']
    # Build set features

    # Timestamp-derived features: hour of day and day of month.
    count_feature['time_stamp'] = pd.to_datetime(count_feature['time_stamp'])
    count_feature['history_hour'] = pd.DatetimeIndex(
        count_feature.time_stamp).hour
    count_feature['history_day'] = pd.DatetimeIndex(
        count_feature.time_stamp).day

    # Statistics missing after the left merges are reported as 0.
    count_feature = count_feature.fillna(0)
    count_feature.rename(columns={'nature_order': 'history_nature_order'},
                         inplace=True)
    return count_feature