Пример #1
0
def run():
    """Load the cleaned tables, attach composite ids, and build both feature sets.

    Runs two feature passes: the training window (label day 31, full item
    set) and the prediction window (label day 32, subset items only).
    """
    csv_data_all = pd.read_csv(
        com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))
    csv_data_item = pd.read_csv(
        com.get_project_path('Data/Csv/ClnData/csv_data_item.csv'))
    csv_data_p = pd.read_csv(
        com.get_project_path('Data/Csv/ClnData/csv_data_p.csv'))

    # Composite user-item / user-category ids used downstream for label joins.
    for frame in (csv_data_all, csv_data_p):
        frame['ui_id'] = sp.get_ui_id(frame)
        frame['uc_id'] = sp.get_uc_id(frame)

    for label_day_rank, p_only in ((31, False), (32, True)):
        get_feature(data_all=csv_data_all,
                    data_p=csv_data_p,
                    data_item=csv_data_item,
                    label_day_rank=label_day_rank,
                    p_only=p_only,
                    duration=31,
                    save=True)
Пример #2
0
def run():
    """Rank features with RFE over an XGBoost classifier and dump the ranking."""
    train_x = pd.read_csv(
        com.get_project_path(
            'Data/Csv/FeaData/_A/fea_all_label31_dur31_sl1.csv'))
    train_y_ui = sp.get_csv_label(
        pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv')),
        31)

    print('特征数量: ' + str(len(train_x.columns) - 2))
    print('训练集数量: ' + str(len(train_x)))

    # Positive label: the row's composite ui_id appears among day-31 purchases.
    membership = sp.get_ui_id(train_x).isin(sp.get_ui_id(train_y_ui))
    train_y = membership.replace({True: 1, False: 0})
    # Ratio features can divide by zero; cap inf at 1.
    train_x = train_x.replace({np.inf: 1})

    estimator = XGBClassifier(n_estimators=10,
                              learning_rate=0.05,
                              max_depth=5,
                              colsample_bytree=0.8,
                              subsample=0.8,
                              min_child_weight=16)
    rfe = RFE(estimator=estimator, n_features_to_select=30)
    feature_frame = train_x.drop(['user_id', 'item_id'], axis=1)
    rfe.fit(feature_frame, train_y)

    # Pair each feature with its (rounded) RFE rank, sorted best-first.
    ranked = sorted(
        zip(map(lambda x: round(x, 4), rfe.ranking_), feature_frame.columns))
    result = pd.DataFrame(ranked, columns=['score', 'feature'])
    result.to_csv(com.get_project_path('Data/Temp/feature_rfe_.csv'),
                  index=None)
    print(result)
Пример #3
0
def run():
    """Merge the raw user log with the item subset and save the combined table."""
    csv_data_item = pd.read_csv(
        com.get_project_path(
            'Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'),
        header=0,
        names=['item_id', 'item_geo', 'item_cate'])
    csv_data_user = pd.read_csv(
        com.get_project_path(
            'Data/Csv/OriData/tianchi_fresh_comp_train_user.csv'),
        header=0,
        names=['user_id', 'item_id', 'beh_type', 'user_geo', 'item_cate',
               'time'])

    # Left-merge against the de-duplicated item ids.  The join adds no new
    # columns (original author's note called it redundant), kept for parity.
    item_ids = csv_data_item.loc[:, ['item_id']].drop_duplicates()
    csv_data_all = pd.merge(csv_data_user, item_ids, how='left', on='item_id')
    csv_data_all.to_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all.csv'), index=None)

    # Keep the first 10k rows as a small fixture for quick code testing.
    csv_data_all.head(10000).to_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'),
        index=None)
Пример #4
0
def run():
    """Clean the raw log: parse timestamps into day features, decode geohash
    coordinates, and save sorted full/item/subset tables into ClnData/."""
    csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all.csv'))
    csv_data_item = pd.read_csv(com.get_project_path('Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'), header=0, names=['item_id', 'item_geo', 'item_cate'])
    # Uncomment the line below when testing the code (10k-row sample)
    # csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'))

    # Time handling: hour of day, normalized date, weekday (1-7), and a
    # dense 1-based rank of the date ("day_rank").
    csv_data_all['time'] = pd.to_datetime(csv_data_all['time'], format='%Y%m%d %H')
    csv_data_all['hour'] = csv_data_all['time'].dt.hour
    csv_data_all['time'] = csv_data_all['time'].dt.normalize()
    csv_data_all['week'] = csv_data_all['time'].apply(lambda a: a.weekday()+1)
    csv_data_all['day_rank'] = csv_data_all['time'].rank(method='dense').apply(lambda a: int(a))
    # del csv_data_all['time']

    # Geohash handling: decode to (lat, lon); bad/missing hashes fall back
    # to sentinel values via get_lat_lon's `inplace` argument.
    # NOTE(review): item sentinels are (-90, 180) while user sentinels are
    # (90, -180) -- possibly deliberate (keeps unknown users maximally far
    # from unknown items) but unconfirmed; verify before using geo distances.
    csv_data_item['item_geo'] = csv_data_item['item_geo'].replace('input_data_is_error', '').fillna('').apply(lambda a: gh64.decode(a))
    csv_data_item['item_geo_lat'] = csv_data_item['item_geo'].apply(lambda a: get_lat_lon(a, 0, inplace=-90))
    csv_data_item['item_geo_lon'] = csv_data_item['item_geo'].apply(lambda a: get_lat_lon(a, 1, inplace=180))
    del csv_data_item['item_geo']
    csv_data_all['user_geo'] = csv_data_all['user_geo'].replace('input_data_is_error', '').fillna('').apply(lambda a: gh64.decode(a))
    csv_data_all['user_geo_lat'] = csv_data_all['user_geo'].apply(lambda a: get_lat_lon(a, 0, inplace=90))
    csv_data_all['user_geo_lon'] = csv_data_all['user_geo'].apply(lambda a: get_lat_lon(a, 1, inplace=-180))
    del csv_data_all['user_geo']

    # Save the sorted outputs: full log, item table, and item-subset log.
    com.save_csv(csv_data_all.sort_values(by=['user_id', 'day_rank', 'item_id', 'beh_type']), com.get_project_path('Data/Csv/ClnData/'), 'csv_data_all.csv')
    com.save_csv(csv_data_item.sort_values(by=['item_id', 'item_cate']), com.get_project_path('Data/Csv/ClnData/'), 'csv_data_item.csv')
    com.save_csv(csv_data_all[csv_data_all['item_id'].isin(csv_data_item['item_id'])].sort_values(by=['user_id', 'day_rank', 'item_id', 'beh_type']),
                 com.get_project_path('Data/Csv/ClnData/'), 'csv_data_p.csv')

    # Keep the first 10k rows as a small fixture for quick code testing.
    csv_data_all.head(10000).sort_values(by=['user_id', 'day_rank', 'item_id']).to_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all_h1w.csv'), index=None)
def run():
    """Train on day-30 labels, score the day-31 subset, and print its F1.

    The label/id frames are exposed as module globals so they can be
    inspected interactively after the run finishes.
    """
    global train_y_ui
    global test_y_ui
    global train_ui
    global test_ui

    train_x = pd.read_csv(
        com.get_project_path(
            'Data/Csv/FeaData/_A/fea_all_label30_dur31_sl1.csv'))
    test_x = pd.read_csv(
        com.get_project_path(
            'Data/Csv/FeaData/_A/fea_all_label31_dur31_sl1_p.csv'))

    # Ground-truth purchase (user, item) pairs for the two label days.
    train_y_ui = sp.get_csv_label(
        pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv')),
        30)
    test_y_ui = sp.get_csv_label(
        pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_p.csv')),
        31)

    print('特征数量: ' + str(len(train_x.columns) - 2))
    print('训练集数量: ' + str(len(train_x)))

    train_ui = train_x.loc[:, ['user_id', 'item_id']]
    test_ui = test_x.loc[:, ['user_id', 'item_id']]

    # Binary labels: 1 when the row's composite ui_id is a real purchase.
    train_y = sp.get_ui_id(train_x).isin(sp.get_ui_id(train_y_ui)).replace({
        True:
        1,
        False:
        0
    })
    test_y = sp.get_ui_id(test_x).isin(sp.get_ui_id(test_y_ui)).replace({
        True:
        1,
        False:
        0
    })
    # ########### model ############ #
    pre_label = xgb_pre(train_x.drop(['user_id', 'item_id'], axis=1),
                        train_y,
                        test_x.drop(['user_id', 'item_id'], axis=1),
                        test_y=test_y,
                        if_save_imp=False)

    # Threshold at the 701st-highest score so the top 700 rows are predicted
    # positive.  NOTE(review): raises IndexError if fewer than 701 rows.
    tmp = list(pre_label.sort_values(ascending=False))[700]
    pre_label = pre_label.apply(lambda a: a >= tmp).replace({
        True: 1,
        False: 0
    })
    test_x['label'] = pre_label
    test_pre_ui = test_x[test_x['label'] == 1].loc[:, ['user_id', 'item_id']]
    sp.f1_score(test_pre_ui,
                test_y_ui.loc[:, ['user_id', 'item_id']],
                if_print=True)
    # Restore test_x to its original columns.
    del test_x['label']
def run():
    """Train on day-31 labels, predict day 32, and save the top-500 pairs."""
    data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))

    train_x = pd.read_csv(com.get_project_path(FEATURE_PATH+'fea_all_label31_dur31_sl3.csv'))
    train_x['ui_id'] = sp.get_ui_id(train_x)
    test_x = pd.read_csv(com.get_project_path(FEATURE_PATH+'fea_all_label32_dur31_sl3_p.csv'))
    test_x['ui_id'] = sp.get_ui_id(test_x)

    # Binary label: 1 when the composite ui_id is an actual day-31 purchase.
    train_y = sp.get_csv_label(data_all, 31)
    train_y['ui_id'] = sp.get_ui_id(train_y)
    train_y = train_x['ui_id'].isin(train_y['ui_id']).replace({True: 1, False: 0})

    print('特征数量: '+str(len(train_x.columns)-3))
    print('训练集数量: ' + str(len(train_x)))
    # ########### model ############ #
    pre_label = xgb_pre(train_x.drop(['user_id', 'item_id', 'ui_id'], axis=1), train_y,
                        test_x.drop(['user_id', 'item_id', 'ui_id'], axis=1))

    # Keep the 500 highest-scoring pairs as the submission.
    # NOTE(review): raises IndexError if fewer than 501 predictions.
    tmp = list(pre_label.sort_values(ascending=False))[500]
    pre_label = pre_label.apply(lambda a: a>=tmp).replace({True: 1, False: 0})
    test_x['label'] = pre_label
    csv_fea_label24_dur14_p = test_x[test_x['label']==1].loc[:, ['user_id', 'item_id']]
    save_name = '_A_02_xgb_202001032331.csv'
    com.save_csv(csv_fea_label24_dur14_p.loc[:, ['user_id', 'item_id']], com.get_project_path(RESULT_PATH), save_name)
def xgb_pre(train_x,
            train_y,
            test_x,
            num_round=500,
            params=None,
            test_y=None,
            if_save_imp=True):
    """Train an XGBoost binary classifier and predict on the test set.

    Args:
        train_x: training feature frame (id columns already dropped).
        train_y: 0/1 labels aligned with ``train_x``.
        test_x: feature frame to score.
        num_round: number of boosting rounds.
        params: xgboost parameter dict; ``None`` selects the tuned defaults
            below.
        test_y: optional test labels; when given, training logs train/test
            metrics each round using the custom ``evalerror`` feval.
        if_save_imp: when True, dump feature importances to Data/Temp/.

    Returns:
        pd.Series of predicted scores for ``test_x`` (fresh 0..n-1 index).
    """
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x, label=test_y)
    if params is None:
        params = {
            'objective': 'binary:logistic',
            # 'objective': 'rank:pairwise',
            'eta': 0.01,
            'max_depth': 5,
            'colsample_bytree': 0.8,
            'subsample': 0.8,
            'min_child_weight': 16,
            'tree_method': 'exact',
            # 'gamma': 0.1,
            # 'scale_pos_weight': 10,
            # 'max_delta_step': 0.7,
            # 'eval_metric': 'auc',
        }
    if test_y is None:
        # No eval labels: train silently, no watchlist needed.
        bst = xgb.train(params, dtrain, num_boost_round=num_round)
    else:
        # Build the watchlist only in the branch that actually uses it.
        watchlist = [(dtrain, 'train'), (dtest, 'test')]
        bst = xgb.train(params, dtrain, num_round, watchlist, feval=evalerror)

    if if_save_imp:
        imp_dict = bst.get_fscore(fmap='')
        imp = pd.DataFrame({
            'column': list(imp_dict.keys()),
            'importance': list(imp_dict.values())
        })
        com.save_csv(imp.sort_values(by='importance'),
                     com.get_project_path('Data/Temp/'),
                     'xgb-val_importance.csv')
    pre_label = pd.Series(bst.predict(dtest))
    return pre_label
def run():
    """Exploratory analysis of the raw log.

    Summary findings are recorded in the triple-quoted notes; most analyses
    and plots are kept commented out so they can be re-enabled one at a time.
    The live tail counts users who appear on exactly one day and plots it.
    """
    csv_data_all = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all.csv'))
    csv_data_item = pd.read_csv(com.get_project_path(
        'Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'),
                                header=0,
                                names=['item_id', 'item_geo', 'item_cate'])
    csv_data_p = csv_data_all[csv_data_all['item_id'].isin(
        csv_data_item['item_id'])]

    # Uncomment the line below when testing the code (10k-row sample)
    # csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'))

    # Time handling; comment out unneeded lines to save a lot of time
    csv_data_all['time'] = pd.to_datetime(csv_data_all['time'],
                                          format='%Y%m%d %H')
    # csv_data_all['hour'] = csv_data_all['time'].dt.hour
    csv_data_all['time'] = csv_data_all['time'].dt.normalize()
    # csv_data_all['week'] = csv_data_all['time'].apply(lambda a: a.weekday()+1)
    # csv_data_all['day'] = csv_data_all['time'].dt.day
    csv_data_all['day_rank'] = csv_data_all['time'].rank(
        method='dense').apply(lambda a: int(a))

    # ###### textual findings ###### #
    '''
    最小日期 2014-11-18 00:00:00
    最大日期 2014-12-18 23:00:00
    '''
    # print(min(csv_data_all['time']))
    # print(max(csv_data_all['time']))
    '''
    总人数 20000
    总商品数 4758484
    '''
    # print(len(set(csv_data_all['user_id'])))
    # print(len(set(csv_data_all['item_id'])))
    '''
    同一商品只有一种类型
    '''
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_id'].apply(lambda a: str(a)) + csv_data_item['item_cate'].apply(lambda a: str(a)))))
    '''
    同一商品会有多个经纬度
    '''
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_id'].apply(lambda a: str(a)) + csv_data_item['item_geo'].apply(lambda a: str(a)))))
    '''
    同一用户会有多个经纬度
    '''
    # print(len(set(csv_data_all['user_id'])))
    # print(len(set(csv_data_all['user_id'].apply(lambda a: str(a)) + csv_data_all['user_geo'].apply(lambda a: str(a)))))
    '''
    找到异常用户
    '''
    # csv_user_bh_count = com.pivot_table_plus(csv_data_all, 'user_id', 'item_id', 'count', 'bh_count')
    # csv_user_day_count = com.pivot_table_plus(csv_data_all, 'user_id', 'day_rank',  com.count_with_drop_duplicates_for_series, 'day_count')
    # csv_user_bh_count = pd.merge(csv_user_bh_count, csv_user_day_count, on='user_id', how='left')
    # csv_user_bh_count['bh_count_mean'] = csv_user_bh_count['bh_count'] / csv_user_bh_count['day_count']
    # csv_user_bh_count = csv_user_bh_count.sort_values(by='bh_count_mean', ascending=False).head(15)
    # print(csv_user_bh_count)
    #
    # csv_user_4_count = com.pivot_table_plus(csv_data_all[csv_data_all['user_id'].isin(csv_user_bh_count['user_id']) & (csv_data_all['beh_type']==4)],
    #                                         'user_id', 'item_id', 'count', 'bh4_count')
    # print(csv_user_4_count)
    '''
    经纬度中的字符set{'d', 'n', '4', '3', 'l', 'e', 'j', 'p', 'h', 
    't', '_', 'c', 'm', '5', 'v', '7', 'o', 'k', 's', '9', '0', 
    'g', 'w', 'r', 'u', 'q', '1', 'f', '2', 'a', 'b', 'i', '6'}
    其中'_'来自'input_data_is_error'
    整理下,正常的set为:012345679abcdefghijklmnopqrstuvw,缺少 8 x y z
    '''
    # set_geo = set(list(csv_data_all['item_geo'].dropna())+list(csv_data_all['user_geo'].dropna()))
    # str_geo = str(set_geo).replace('\'', '').replace(',', '').replace(' ', '')[1:-1]
    # print(set(str_geo))
    '''
    销售量 全集/子集
    大于1 的商品有 31235/3010 件
    大于2 的商品有 11759/1090 件
    大于10 的商品有 504/62 件
    大于20 的商品有 107/10 件
    大于50 的商品有 24/1 件
    '''
    # csv_data_cate4 = csv_data_all[csv_data_all['beh_type']==4]
    # csv_data_cate4 = pd.pivot_table(csv_data_cate4, index='item_id', values='user_id', aggfunc='count').reset_index()
    # print(csv_data_cate4[csv_data_cate4['user_id']>1])
    # print(csv_data_cate4[csv_data_cate4['user_id']>2])
    # print(csv_data_cate4[csv_data_cate4['user_id']>10])
    # print(csv_data_cate4[csv_data_cate4['user_id']>20])
    # print(csv_data_cate4[csv_data_cate4['user_id']>50])

    # csv_data_cate4 = csv_data_p[csv_data_p['beh_type']==4]
    # csv_data_cate4 = pd.pivot_table(csv_data_cate4, index='item_id', values='user_id', aggfunc='count').reset_index()
    # print(csv_data_cate4[csv_data_cate4['user_id']>1])
    # print(csv_data_cate4[csv_data_cate4['user_id']>2])
    # print(csv_data_cate4[csv_data_cate4['user_id']>10])
    # print(csv_data_cate4[csv_data_cate4['user_id']>20])
    # print(csv_data_cate4[csv_data_cate4['user_id']>50])
    '''
    有人会同一天多次买多种商品,目测的
    '''
    '''
    总的行为有23291027条
    对于子集商品只有2084859条
    '''
    # print(len(csv_data_all))
    # print(len(csv_data_all[csv_data_all['item_id'].isin(csv_data_item['item_id'])]))
    '''
    全集商品4758484种,分类9557种
    子集商品422858种,分类1054种
    '''
    # print(len(set(csv_data_all['item_id'])))
    # print(len(set(csv_data_all['item_cate'])))
    #
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_cate'])))

    # ##### graphical analysis ###### #
    '''
    销售长达x天的商品数量(两张图)
    # '''
    # csv_item_count_by_sale_day_count = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_sale_day_count = pd.pivot_table(csv_item_count_by_sale_day_count, index='item_id', values='day_rank',
    #                                                   aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'sale_day_count'})
    # # print(csv_item_count_by_sale_day_count)
    # csv_item_count_by_sale_day_count = pd.pivot_table(csv_item_count_by_sale_day_count, index='sale_day_count', values='item_id',
    #                                                   aggfunc='count').rename(columns={'item_id': 'item_count'}).sort_values(by='item_count', ascending=False)
    # # print(csv_item_count_by_sale_day_count)
    # csv_item_count_by_sale_day_count.plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_sale_day_count.jpg'))
    # # plt.show()
    #
    # csv_item_count_by_sale_day_count.tail(21).plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_sale_day_count_t21.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    记录长达x天的商品数量(两张图)
    '''
    # csv_item_count_by_log_day_count = csv_data_all.copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_log_day_count = pd.pivot_table(csv_item_count_by_log_day_count, index='item_id', values='day_rank',
    #                                                  aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'log_day_count'})
    # # print(csv_item_count_by_log_day_count)
    # csv_item_count_by_log_day_count = pd.pivot_table(csv_item_count_by_log_day_count, index='log_day_count', values='item_id',
    #                                                  aggfunc='count').rename(columns={'item_id': 'item_count'}).sort_values(by='item_count', ascending=False)
    # # print(csv_item_count_by_log_day_count)
    # csv_item_count_by_log_day_count.plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_log_day_count.jpg'))
    # # plt.show()
    #
    # csv_item_count_by_log_day_count.tail(21).plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_log_day_count_t21.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    购买长达x天的个人数量
    '''
    # csv_user_count_by_sale_day_count = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['day_rank', 'user_id']]
    # csv_user_count_by_sale_day_count = pd.pivot_table(csv_user_count_by_sale_day_count, index='user_id', values='day_rank',
    #                                                   aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'sale_day_count'})
    # # print(csv_user_count_by_sale_day_count)
    # csv_user_count_by_sale_day_count = pd.pivot_table(csv_user_count_by_sale_day_count, index='sale_day_count', values='user_id',
    #                                                   aggfunc='count').rename(columns={'user_id': 'user_count'}).sort_values(by='user_count', ascending=False)
    # # print(csv_user_count_by_sale_day_count)
    # csv_user_count_by_sale_day_count.plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_sale_day_count.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    记录长达x天的个人数量
    '''
    # csv_user_count_by_log_day_count = csv_data_all.copy().loc[:, ['day_rank', 'user_id']]
    # csv_user_count_by_log_day_count = pd.pivot_table(csv_user_count_by_log_day_count, index='user_id', values='day_rank',
    #                                                  aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'log_day_count'})
    # # print(csv_user_count_by_log_day_count)
    # csv_user_count_by_log_day_count = pd.pivot_table(csv_user_count_by_log_day_count, index='log_day_count', values='user_id',
    #                                                  aggfunc='count').rename(columns={'user_id': 'user_count'}).sort_values(by='user_count', ascending=False)
    # # print(csv_user_count_by_log_day_count)
    # csv_user_count_by_log_day_count.plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_log_day_count.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品全集/子集 销售前100计数
    '''
    # csv_item_sale_by_user = csv_data_all[(csv_data_all['beh_type']==4) & csv_data_all['item_id'].isin(csv_data_item['item_id'])].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='item_id', values='user_id', aggfunc='count').rename(columns={'user_id': 'item_sale'}).sort_values(by='item_sale', ascending=False).head(100)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_item_P.jpg'))
    # plt.show()
    # gc.collect()

    # csv_item_sale_by_user = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='item_id', values='user_id', aggfunc='count').rename(columns={'user_id': 'item_sale'}).sort_values(by='item_sale', ascending=False).head(100)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_item.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品记录计数
    '''
    # csv_item_log_by_user = csv_data_all.copy().loc[:, ['user_id', 'item_id']]
    # csv_item_log_by_user = pd.pivot_table(csv_item_log_by_user, index='item_id', values='user_id',
    #                                       aggfunc='count').rename(columns={'user_id': 'item_log'}).sort_values(by='item_log', ascending=False).head(100)
    #
    # csv_item_log_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_log_by_item.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    个人购买计数
    '''
    # csv_item_sale_by_user = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='user_id', values='item_id',
    #                                        aggfunc='count').rename(columns={'item_id': 'item_sale'}).sort_values(by='item_sale', ascending=False)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xticks([])
    # plt.xlabel('users')
    # plt.xticks(np.arange(0, 20001, 1000), np.arange(0, 20001, 1000), rotation=60)
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_user.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    个人记录计数
    '''
    # csv_item_log_by_user = csv_data_all.copy().loc[:, ['user_id', 'item_id']]
    # csv_item_log_by_user = pd.pivot_table(csv_item_log_by_user, index='user_id', values='item_id',
    #                                       aggfunc='count').rename(columns={'item_id': 'item_log'}).sort_values(by='item_log', ascending=False)
    #
    # csv_item_log_by_user.plot.bar()
    # plt.xticks([])
    # plt.xlabel('users')
    # plt.xticks(np.arange(0, 20001, 1000), np.arange(0, 20001, 1000), rotation=60)
    # plt.savefig(com.get_project_path('Data/Graph/item_log_by_user.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品种类记录当天的增加占比和减少减少
    eg: day1[a, b, c], day2[b, c, d, e]
    increase_rate_of_log_count = [d, e] / [b, c, d, e] = 1/2
    decrement_rate_of_log_count = [a] / [a, b, c] = 1/3
    '''
    # csv_data_all_copy = csv_data_all.copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_day_rank = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id',
    #                                             aggfunc=com.count_with_drop_duplicates_for_series).reset_index()
    # csv_item_count_by_day_rank['increase_rate_of_log_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['item_id'],
    #                                       csv_data_all_copy[csv_data_all_copy['day_rank'] == (day_rank - 1)][
    #                                           'item_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]) for day_rank in range(2, 32)]
    # csv_item_count_by_day_rank['decrement_rate_of_log_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank - 1]['item_id'],
    #                                       csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['item_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank - 1]) for day_rank in range(2, 32)]
    #
    # del csv_item_count_by_day_rank['item_id']
    # csv_item_count_by_day_rank = csv_item_count_by_day_rank.set_index('day_rank')
    #
    # csv_item_count_by_day_rank.plot()
    # plt.xticks(range(1, 32, 2))
    # plt.xlabel('day rank')
    # plt.savefig(com.get_project_path('Data/Graph/item_inc&dec_rate_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    去重人数当天的增加占比和减少减少
    '''
    # csv_data_all_copy = csv_data_all.copy().loc[:, ['day_rank', 'user_id']]
    # csv_item_count_by_day_rank = pd.pivot_table(csv_data_all_copy, index='day_rank', values='user_id', aggfunc=com.count_with_drop_duplicates_for_series).reset_index()
    # csv_item_count_by_day_rank['increase_rate_of_user_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank']==day_rank]['user_id'], csv_data_all_copy[csv_data_all_copy['day_rank']==(day_rank-1)]['user_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]) for day_rank in range(2, 32)]
    # csv_item_count_by_day_rank['decrement_rate_of_user_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank-1]['user_id'], csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['user_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank-1]) for day_rank in range(2, 32)]
    #
    # del csv_item_count_by_day_rank['user_id']
    # csv_item_count_by_day_rank = csv_item_count_by_day_rank.set_index('day_rank')
    #
    # csv_item_count_by_day_rank.plot()
    # plt.xticks(range(1, 32, 2))
    # plt.xlabel('day rank')
    # plt.savefig(com.get_project_path('Data/Graph/user_inc&dec_rate_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    五个星期的记录对比图
    '''
    # csv_data_week1 = csv_data_all[csv_data_all['day_rank']<=6]
    # csv_data_week1.loc[:, ['day_rank']] = csv_data_week1['day_rank']+1
    # csv_data_week1 = pd.pivot_table(csv_data_week1, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week2 = csv_data_all[(csv_data_all['day_rank']>6) & (csv_data_all['day_rank']<=13)]
    # csv_data_week2.loc[:, ['day_rank']] = csv_data_week2['day_rank']-6
    # csv_data_week2 = pd.pivot_table(csv_data_week2, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week3 = csv_data_all[(csv_data_all['day_rank']>13) & (csv_data_all['day_rank']<=20)]
    # csv_data_week3.loc[:, ['day_rank']] = csv_data_week3['day_rank']-13
    # csv_data_week3 = pd.pivot_table(csv_data_week3, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week4 = csv_data_all[(csv_data_all['day_rank']>20) & (csv_data_all['day_rank']<=27)]
    # csv_data_week4.loc[:, ['day_rank']] = csv_data_week4['day_rank']-20
    # csv_data_week4 = pd.pivot_table(csv_data_week4, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week5 = csv_data_all[csv_data_all['day_rank']>27]
    # csv_data_week5.loc[:, ['day_rank']] = csv_data_week5['day_rank']-27
    # csv_data_week5 = pd.pivot_table(csv_data_week5, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_weeks = pd.concat([csv_data_week1, csv_data_week2, csv_data_week3, csv_data_week4, csv_data_week5], axis=1)
    # csv_data_weeks.columns=['week1', 'week2', 'week3', 'week4', 'week5']
    # csv_data_weeks = csv_data_weeks.fillna(np.mean(csv_data_weeks)//2)
    # csv_data_weeks.plot.bar()
    # plt.ylabel('sale count')
    # plt.xlabel('day of the week')
    # plt.savefig(com.get_project_path('Data/Graph/log_count_by_week.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    五个星期的销售对比图
    '''
    # csv_data_all_copy = csv_data_all[csv_data_all['beh_type']==4].copy()
    # csv_data_week1 = csv_data_all_copy[csv_data_all_copy['day_rank']<=6]
    # csv_data_week1.loc[:, ['day_rank']] = csv_data_week1['day_rank']+1
    # csv_data_week1 = pd.pivot_table(csv_data_week1, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week2 = csv_data_all_copy[(csv_data_all_copy['day_rank']>6) & (csv_data_all_copy['day_rank']<=13)]
    # csv_data_week2.loc[:, ['day_rank']] = csv_data_week2['day_rank']-6
    # csv_data_week2 = pd.pivot_table(csv_data_week2, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week3 = csv_data_all_copy[(csv_data_all_copy['day_rank']>13) & (csv_data_all_copy['day_rank']<=20)]
    # csv_data_week3.loc[:, ['day_rank']] = csv_data_week3['day_rank']-13
    # csv_data_week3 = pd.pivot_table(csv_data_week3, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week4 = csv_data_all_copy[(csv_data_all_copy['day_rank']>20) & (csv_data_all_copy['day_rank']<=27)]
    # csv_data_week4.loc[:, ['day_rank']] = csv_data_week4['day_rank']-20
    # csv_data_week4 = pd.pivot_table(csv_data_week4, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week5 = csv_data_all_copy[csv_data_all_copy['day_rank']>27]
    # csv_data_week5.loc[:, ['day_rank']] = csv_data_week5['day_rank']-27
    # csv_data_week5 = pd.pivot_table(csv_data_week5, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_weeks = pd.concat([csv_data_week1, csv_data_week2, csv_data_week3, csv_data_week4, csv_data_week5], axis=1)
    # csv_data_weeks.columns=['week1', 'week2', 'week3', 'week4', 'week5']
    # csv_data_weeks = csv_data_weeks.fillna(np.mean(csv_data_weeks)//2)
    # csv_data_weeks.plot.bar()
    # plt.ylabel('log count')
    # plt.xlabel('day of the week')
    # plt.savefig(com.get_project_path('Data/Graph/sale_count_by_week.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的销量图
    '''
    # csv_data_all_copy = csv_data_all[csv_data_all['beh_type']==4].copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc='count')
    # csv_data_all_copy.plot(color='g', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['item sale'])
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的记录量量图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc='count')
    # csv_data_all_copy.plot(color='g', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['log count'])
    # plt.savefig(com.get_project_path('Data/Graph/log_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的商品种类图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc=com.count_with_drop_duplicates_for_series)
    # csv_data_all_copy.plot(color='b', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['item count'])
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的去重人数图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='user_id', aggfunc=com.count_with_drop_duplicates_for_series)
    # csv_data_all_copy.plot(color='b', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['user count'])
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    只在某一天出现的用户和商品计数 (没优化,跑的很慢)
    '''
    user_len = []

    # Users active on exactly one day.  This pivot depends only on
    # csv_data_all, so compute it once; the original recomputed it inside
    # the loop 30 times, which is why it "ran very slowly".
    csv_user_day_count = com.pivot_table_plus(
        csv_data_all, 'user_id', 'day_rank',
        com.count_with_drop_duplicates_for_series, 'day_count')
    csv_user_day_count = csv_user_day_count[csv_user_day_count['day_count']
                                            == 1]

    for i in range(1, 31):
        # Distinct users seen on day i, restricted to one-day-only users.
        csv_user_only1212 = csv_data_all[csv_data_all['day_rank'] ==
                                         i].drop_duplicates('user_id')
        csv_user_only1212 = csv_user_only1212[
            csv_user_only1212['user_id'].isin(csv_user_day_count['user_id'])]
        user_len += [len(csv_user_only1212)]
    user_len = pd.DataFrame({
        'day_rank': range(1, 31),
        'user_count': user_len
    }).set_index('day_rank')
    plt.plot(user_len)
    plt.savefig(
        com.get_project_path(
            'Data/Graph/user_only_one_day_count_by_day_rank.jpg'))
    plt.show()
Пример #9
0
def get_feature(data_all,
                data_p,
                label_day_rank,
                duration=7,
                p_only=True,
                data_item=None,
                save=False):
    """Assemble the full feature table: user + item + user-x-item features.

    Each of the three sub-feature sets is computed (and persisted to CSV by
    the respective helper), then the three CSVs are left-joined onto the
    candidate (user_id, item_id) pairs produced by the UI feature set.

    Args:
        data_all: full behaviour log (all items).
        data_p: behaviour log restricted to the prediction item subset.
        label_day_rank: day used as the label; features are built from the
            `duration` days preceding it.
        duration: length of the feature window in days.
        p_only: if True, candidates come from `data_p`, else from `data_all`.
        data_item: item metadata table, forwarded to the helpers.
        save: if True, write the combined CSV and return its path;
            otherwise return the combined DataFrame.
    """
    shared_kwargs = dict(data_all=data_all,
                         data_p=data_p,
                         data_item=data_item,
                         label_day_rank=label_day_rank,
                         duration=duration,
                         p_only=p_only,
                         save=True)

    # Part 1: per-user features.
    fea_user_path = get_user_feature(**shared_kwargs)
    # Part 2: per-item features.
    fea_item_path = get_item_feature(**shared_kwargs)
    # Part 3: user-x-item interaction features.
    fea_ui_path = get_ui_feature(**shared_kwargs)

    # Combine: start from the candidate (user_id, item_id) pairs, then
    # left-join user, item, and interaction features in turn.
    data_fea = pd.read_csv(fea_ui_path).loc[:, ['user_id', 'item_id']]
    data_fea = data_fea.merge(pd.read_csv(fea_user_path),
                              on='user_id',
                              how='left')
    data_fea = data_fea.merge(pd.read_csv(fea_item_path),
                              on='item_id',
                              how='left')
    data_fea = data_fea.merge(pd.read_csv(fea_ui_path),
                              on=['user_id', 'item_id'],
                              how='left')

    if save is not True:
        return data_fea
    save_name = get_save_name(label_day_rank, duration, p_only, index='all')
    com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
    return com.get_project_path(FEATURE_PATH) + save_name
Пример #10
0
def get_ui_feature(data_all,
                   data_p,
                   label_day_rank,
                   duration=7,
                   p_only=True,
                   data_item=None,
                   save=False):
    """Build user-x-item interaction features from the `duration` days
    immediately preceding `label_day_rank`.

    Args:
        data_all: full behaviour log; must carry the derived 'ui_id'
            (user x item) and 'uc_id' (user x category) columns.
        data_p: behaviour log restricted to the prediction item subset.
        label_day_rank: day used as the label; features use days
            [label_day_rank - duration, label_day_rank - 1].
        duration: length of the feature window in days.
        p_only: if True, candidate pairs come from `data_p`,
            otherwise from `data_all`.
        data_item: unused here; kept for signature symmetry with the
            sibling get_user_feature / get_item_feature helpers.
        save: if True, write the feature CSV and return its path;
            otherwise return the DataFrame.

    Returns:
        Path string of the saved CSV when save is True, else a DataFrame
        keyed by (user_id, item_id).

    beh_type codes (per the original comments in this file): 1=view,
    2=favorite, 3=add-to-cart, 4=purchase.
    """
    # Restrict both logs to the feature window.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate pairs: user/item combinations viewed within the last
    # SET_LENGTH days before the label.
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_p['beh_type'] == 1
               )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                          ]].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1
               )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                          ]].drop_duplicates()

    # How many times the user viewed/favorited/carted/purchased this item
    # exactly 1/2/3 days before the label day.
    for ago_time in [1, 2, 3]:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'beh_type_' + str(beh_type) + '_count&ui_id&' + str(
                ago_time) + '_day_ago'
            feature = com.pivot_table_plus(
                data_all[(data_all['beh_type'] == beh_type)
                         & (data_all['day_rank'] == label_day_rank -
                            ago_time)],
                index='ui_id',
                values='user_id',
                aggfunc='count',
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='ui_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Whether the user ever purchased (beh_type 4) / favorited (beh_type 2)
    # this item within the window.
    data_fea['beh_type_4_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 4]['ui_id'])).replace({
            True: 1,
            False: 0
        })
    data_fea['beh_type_2_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 2]['ui_id'])).replace({
            True: 1,
            False: 0
        })

    # Type of the user's last interaction with this item, re-encoded as
    # view->2, favorite->1.5, cart->4, purchase->1 (0 when no interaction).
    fea_name = 'beh_type_?_last&ui_id'
    feature = data_all.copy()
    # day_rank*100 + hour gives a sortable timestamp (hour < 100).
    feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
    feature['rank'] = feature.groupby('ui_id')['tmp'].rank(ascending=0)
    feature = feature[feature['rank'] == 1]
    data_fea = pd.merge(data_fea,
                        feature.loc[:, ['ui_id', 'beh_type']],
                        on='ui_id',
                        how='left')
    data_fea[fea_name] = data_fea['beh_type'].replace({
        1: 2,
        2: 1.5,
        3: 4,
        4: 1
    }).fillna(0)
    del data_fea['beh_type']

    # Whether this item is the user's very last interaction in the full
    # log / the subset log.
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all': data = data_all
        else: data = data_p
        fea_name = 'is_last&ui_id&' + data_index
        feature = data.loc[:, ['user_id', 'ui_id', 'day_rank', 'hour'
                               ]].sort_values(
                                   by=['user_id', 'day_rank', 'hour'],
                                   ascending=[0, 0,
                                              0]).drop_duplicates('user_id')
        data_fea[fea_name] = (data_fea['ui_id'].isin(
            feature['ui_id'])).replace({
                True: 1,
                False: 0
            })
        print('# -- ' + fea_name + ' complete -- #')

    # Position-from-last of this item among the user's interactions
    # (full log / subset log).
    # NOTE(review): the rank is computed within each ui_id group and the
    # rank-1 row is then kept, so every present pair ends up with rank 1
    # and absent pairs get the fill value; the comment suggests it was
    # meant to rank within user_id — verify before relying on this feature.
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all': data = data_all
        else: data = data_p
        fea_name = 'last_?&ui_id&' + data_index
        feature = data.loc[:, ['ui_id', 'day_rank', 'hour']]
        feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
        feature['rank'] = feature.groupby('ui_id')['tmp'].rank(method='dense',
                                                               ascending=1)
        feature = feature.sort_values(by=['rank', 'ui_id'],
                                      ascending=[True, True]).drop_duplicates(
                                          ['ui_id']).loc[:, ['ui_id', 'rank']]
        data_fea = pd.merge(data_fea, feature, on='ui_id',
                            how='left').fillna(max(feature['rank'] + 1))
        data_fea = data_fea.rename(columns={'rank': fea_name})
        print('# -- ' + fea_name + ' complete -- #')

    # Hours between the user's last view/favorite/cart/purchase of this
    # item (ui_id) / this category (uc_id) and the label day.
    for beh_type in [1, 2, 3, 4]:
        for id_index in ['ui_id', 'uc_id']:
            fea_name = 'beh_type_' + str(
                beh_type) + '_latest_to_now_hour&' + id_index
            feature = data_all[(
                data_all['beh_type'] == beh_type
            )].loc[:, [id_index, 'day_rank', 'hour']].sort_values(
                by=['day_rank',
                    'hour'], ascending=[0, 0]).drop_duplicates(id_index)
            # BUGFIX: the original computed `label_day_rank - label_day_rank`
            # (always 0), collapsing this feature to `24 - hour`.  Use the
            # day gap, matching the identical feature in get_user_feature.
            feature[fea_name] = feature['day_rank'].apply(
                lambda a: label_day_rank - a)
            feature[fea_name] = (feature[fea_name] * 24) + (24 -
                                                            feature['hour'])
            data_fea = pd.merge(data_fea,
                                feature.loc[:, [id_index, fea_name]],
                                how='left',
                                on=id_index)
            # Pairs with no such event default to the whole window in hours.
            data_fea[fea_name] = data_fea[fea_name].fillna(
                24 * duration).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Drop the helper key columns before saving/returning.
    del data_fea['uc_id']
    del data_fea['ui_id']
    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank, duration, p_only, index='ui')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
Пример #11
0
def get_item_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build per-item and per-item-category features from the `duration`
    days immediately preceding `label_day_rank`.

    Args:
        data_all: full behaviour log (all items).
        data_p: behaviour log restricted to the prediction item subset.
        label_day_rank: day used as the label; features use days
            [label_day_rank - duration, label_day_rank - 1].
        duration: length of the feature window in days.
        p_only: if True, candidate items come from `data_p`,
            otherwise from `data_all`.
        data_item: item metadata table, used for the geo-count features.
        save: if True, write the feature CSV and return its path;
            otherwise return the DataFrame.

    Returns:
        Path string of the saved CSV when save is True, else a DataFrame
        keyed by item_id.

    beh_type codes (per the original comments in this file): 1=view,
    2=favorite, 3=add-to-cart, 4=purchase.
    """
    # Restrict both logs to the feature window.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate items: those viewed within the last SET_LENGTH days.
    if p_only is True:
        data_fea = data_p[(data_p['day_rank'] >= label_day_rank - SET_LENGTH) &
                          (data_p['beh_type'] == 1
                           )].loc[:,
                                  ['item_id', 'item_cate']].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1
               )].loc[:, ['item_id', 'item_cate']].drop_duplicates()

    # Event counts per item / per item category, over the whole window and
    # over the last 1/2/3 days, for each behaviour type.
    for item_index in ['item_id', 'item_cate']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                fea_name = 'beh_type_' + str(beh_type) + '_count&' + str(
                    item_index) + '&latest_' + str(duration_time)
                feature = com.pivot_table_plus(
                    data_all[(data_all['beh_type'] == beh_type)
                             & (data_all['day_rank'] >= label_day_rank -
                                duration_time)],
                    index=item_index,
                    values='user_id',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea,
                                    feature,
                                    on=item_index,
                                    how='left')
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # Rank (ascending and descending) of each item's / category's
    # favorite (2) and purchase (4) counts, in the full log and the subset.
    for data_index in ['data_all', 'data_p']:
        for item_index in ['item_id', 'item_cate']:
            for beh_type in [2, 4]:
                for ascending in [0, 1]:
                    if data_index == 'data_all': data = data_all
                    else: data = data_p
                    fea_name = 'count_rank' + str(ascending) + '&' + str(
                        item_index) + '&beh_type_' + str(beh_type) + '&' + str(
                            data_index)
                    feature = com.pivot_table_plus(
                        data[(data['beh_type'] == beh_type)],
                        index=item_index,
                        values='user_id',
                        aggfunc='count',
                        new_name='tmp')
                    data_fea = pd.merge(data_fea,
                                        feature.loc[:, [item_index, 'tmp']],
                                        on=item_index,
                                        how='left')
                    data_fea['tmp'] = data_fea['tmp'].fillna(0)
                    data_fea[fea_name] = data_fea['tmp'].rank(
                        ascending=ascending, method='dense')
                    print('# -- ' + fea_name + ' complete -- #')
                    del data_fea['tmp']

    # Number of distinct users who viewed/favorited/carted/purchased each
    # item / item category.
    for item_index in ['item_id', 'item_cate']:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'user_count&' + item_index + '&' + 'beh_type_' + str(
                beh_type)
            feature = data_all[
                (data_all[item_index].isin(data_fea[item_index]))
                & (data_all['beh_type'] == beh_type)]
            feature = com.pivot_table_plus(
                feature,
                index=item_index,
                values='user_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Conversion rate per item / per category: purchases (4) over views (1).
    # NOTE(review): fillna only handles 0/0 (NaN); x/0 yields inf, which
    # downstream code appears to replace — confirm before reuse.
    data_fea['item_id_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_id&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_id&latest_' +
                                  str(duration)]
    data_fea['item_id_ctr'] = data_fea['item_id_ctr'].fillna(0)
    data_fea['item_cate_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_cate&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_cate&latest_' +
                                  str(duration)]
    data_fea['item_cate_ctr'] = data_fea['item_cate_ctr'].fillna(0)

    # Number of distinct geo locations per item / item category.
    # -90 appears to be a missing-latitude sentinel here — TODO confirm
    # (the user table filters on 90, the opposite sign).
    for item_index in ['item_id', 'item_cate']:
        fea_name = 'geo_count&' + item_index
        feature = data_item[data_item['item_geo_lat'] != -90]
        feature = com.pivot_table_plus(
            feature, item_index, 'item_geo_lat',
            com.count_with_drop_duplicates_for_series, fea_name)
        data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
        data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)

    # Drop the helper key column before saving/returning.
    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank,
                                  duration,
                                  p_only,
                                  index='item')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
Пример #12
0
def get_user_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build per-user features from the `duration` days immediately
    preceding `label_day_rank`.

    Args:
        data_all: full behaviour log (all items).
        data_p: behaviour log restricted to the prediction item subset.
        label_day_rank: day used as the label; features use days
            [label_day_rank - duration, label_day_rank - 1].
        duration: length of the feature window in days.
        p_only: if True, candidate users come from `data_p`,
            otherwise from `data_all`.
        data_item: unused here; kept for signature symmetry with the
            sibling get_item_feature / get_ui_feature helpers.
        save: if True, write the feature CSV and return its path;
            otherwise return the DataFrame.

    Returns:
        Path string of the saved CSV when save is True, else a DataFrame
        keyed by user_id.

    beh_type codes (per the original comments in this file): 1=view,
    2=favorite, 3=add-to-cart, 4=purchase.
    """
    # Restrict both logs to the feature window.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate users: those with a view within the last SET_LENGTH days.
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_p['beh_type'] == 1)].loc[:, ['user_id']].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1)].loc[:,
                                               ['user_id']].drop_duplicates()

    # Per-user event counts in the full log / subset, over the whole window
    # and over the last 1/2/3 days, for each behaviour type.
    for data_index in ['data_all', 'data_p']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                if data_index == 'data_all': data = data_all
                else: data = data_p
                fea_name = 'beh_type_' + str(
                    beh_type) + '_count&user&latest_' + str(
                        duration_time) + '&' + str(data_index)
                feature = com.pivot_table_plus(
                    data[(data['beh_type'] == beh_type) &
                         (data['day_rank'] >= label_day_rank - duration_time)],
                    index='user_id',
                    values='beh_type',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea,
                                    feature,
                                    on='user_id',
                                    how='left')
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # Number of distinct items the user viewed/favorited/carted/purchased,
    # in the full log and in the subset.
    for data_index in ['data_all', 'data_p']:
        for beh_type in [1, 2, 3, 4]:
            if data_index == 'data_all': data = data_all
            else: data = data_p
            fea_name = 'item_count&' + 'user&' + 'beh_type_' + str(
                beh_type) + '&' + str(data_index)
            feature = com.pivot_table_plus(
                data[(data['beh_type'] == beh_type)],
                index='user_id',
                values='item_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # User conversion rate: purchases (4) over views (1), full log / subset.
    # NOTE(review): fillna only handles 0/0 (NaN); x/0 yields inf, which
    # downstream code appears to replace — confirm before reuse.
    data_fea['user_ctr&data_all'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_all'] / data_fea['beh_type_1_count&user&latest_' +
                                str(duration) + '&data_all']
    data_fea['user_ctr&data_all'] = data_fea['user_ctr&data_all'].fillna(0)
    data_fea['user_ctr&data_p'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_p'] / data_fea['beh_type_1_count&user&latest_' + str(duration) +
                              '&data_p']
    data_fea['user_ctr&data_p'] = data_fea['user_ctr&data_p'].fillna(0)

    # Hours between the user's last view/favorite/cart/purchase and the
    # label day; users with no such event default to the window length.
    # NOTE(review): the loop body does not filter on beh_type, so all four
    # columns describe the user's last event of ANY type — verify intent.
    for beh_type in [1, 2, 3, 4]:
        fea_name = 'beh_type_' + str(beh_type) + '_latest_to_now_hour&user_id'
        feature = data_all.loc[:, ['user_id', 'day_rank', 'hour']].sort_values(
            by=['user_id', 'day_rank',
                'hour'], ascending=[0, 0, 0]).drop_duplicates('user_id')
        feature[fea_name] = feature['day_rank'].apply(
            lambda a: label_day_rank - a)
        feature[fea_name] = (feature[fea_name] * 24) + (24 - feature['hour'])
        data_fea = pd.merge(data_fea,
                            feature.loc[:, ['user_id', fea_name]],
                            how='left',
                            on='user_id')
        data_fea[fea_name] = data_fea[fea_name].fillna(24 *
                                                       duration).astype(int)
        print('# -- ' + fea_name + ' complete -- #')

    # Number of distinct geo locations per user.
    # 90 appears to be a missing-latitude sentinel here — TODO confirm
    # (the item table filters on -90, the opposite sign).
    fea_name = 'geo_count&user_id'
    feature = data_all[data_all['user_geo_lat'] != 90]
    feature = com.pivot_table_plus(feature, 'user_id', 'user_geo_lat',
                                   com.count_with_drop_duplicates_for_series,
                                   fea_name)
    data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
    data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
    print('# -- ' + fea_name + ' complete -- #')

    if save is True:
        save_name = get_save_name(label_day_rank,
                                  duration,
                                  p_only,
                                  index='user')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
Пример #13
0
def run():
    csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))
    csv_data_p = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_p.csv'))

    # 31号全部的购物车记录
    # a = get_result_by_rule1(csv_data_p, beh_type=3, day_rank=31)
    # com.save_csv(a, com.get_project_path('Data/Csv/ResData/_Z/beh_type_3&latest_1_day_201912281826.csv'), 'beh_type_3&latest_1_day.csv')

    '''
    以 前一天所有在购物车 的商品交上去
    '''
    # for i in range(20, 30):
    #     print("第"+str(i+1)+"天为标签")
    #     a = get_result_by_rule1(csv_data_all, beh_type=3, day_rank=i)
    #     b = sp.get_csv_label(csv_data_all, i+1)
    #     print(sp.f1_score(b, a))
    #
    #     a = get_result_by_rule1(csv_data_p, beh_type=3, day_rank=i)
    #     b = sp.get_csv_label(csv_data_p, i+1)
    #     print(sp.f1_score(b, a))

    '''
    以 前一天所有浏览过 的商品交上去
    '''
    # for i in range(20, 30):
    #     print("第"+str(i+1)+"天为标签")
    #     a = get_result_by_rule1(csv_data_all, beh_type=1, day_rank=i)
    #     b = sp.get_csv_label(csv_data_all, i+1)
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))
    #
    #     a = get_result_by_rule1(csv_data_p, beh_type=1, day_rank=i)
    #     b = sp.get_csv_label(csv_data_p, i+1)
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))

    '''
    以 前一天所有在购物车且收藏过 的商品交上去
    '''
    # for i in range(20, 30):
    #     print("\n第"+str(i+1)+"天为标签")
    #     a = get_result_by_rule2(csv_data_all, day_rank=i)
    #     b = sp.get_csv_label(csv_data_all, i+1)
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))
    #
    #     a = get_result_by_rule2(csv_data_p, day_rank=i)
    #     b = sp.get_csv_label(csv_data_p, i+1)
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))


    '''
    纯马后炮测试, 交 标签日期中,曾出现在前一天的购物车里 的商品
    '''
    # for i in range(20, 30):
    #     print("\n第" + str(i + 1) + "天为标签")
    #     b = sp.get_csv_label(csv_data_all, i + 1)
    #     a = csv_data_all[(csv_data_all['day_rank']==i) & (csv_data_all['beh_type']==3)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id'] + a['user_id']).isin(b['item_id'] + b['user_id'])]
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))
    #
    #     b = sp.get_csv_label(csv_data_p, i + 1)
    #     a = csv_data_p[(csv_data_p['day_rank']==i) & (csv_data_p['beh_type']==3)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id'] + a['user_id']).isin(b['item_id'] + b['user_id'])]
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))

    '''
    纯马后炮测试, 交 标签日期中,曾在前一天浏览过 的商品
    '''
    # for i in range(20, 30):
    #     print("\n第" + str(i + 1) + "天为标签")
    #     b = sp.get_csv_label(csv_data_all, i + 1)
    #     a = csv_data_all[(csv_data_all['day_rank']==i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))
    #
    #     b = sp.get_csv_label(csv_data_p, i + 1)
    #     a = csv_data_p[(csv_data_p['day_rank']==i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
    #     print(len(a), len(b))
    #     print(sp.f1_score(b, a))

    '''
    纯马后炮测试, 交 标签日期中,曾在前两天浏览过 的商品
    '''
    # for i in range(20, 30):
    #     print("\n第" + str(i + 1) + "天为标签")
    #     b = sp.get_csv_label(csv_data_all, i + 1)
    #     a = csv_data_all[(csv_data_all['day_rank']>=i-1) & (csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
    #     print(len(a), len(b))
    #     sp.f1_score(b, a, if_print=True)
    #
    #     b = sp.get_csv_label(csv_data_p, i + 1)
    #     a = csv_data_p[(csv_data_p['day_rank']>=i-1) & (csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
    #     a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
    #     print(len(a), len(b))
    #     sp.f1_score(b, a, if_print=True)

    '''
    纯马后炮测试, 交 标签日期中,曾在前七天浏览过 的商品
    '''
    for i in range(20, 30):
        print("\n第" + str(i + 1) + "天为标签")
        b = sp.get_csv_label(csv_data_all, i + 1)
        a = csv_data_all[(csv_data_all['day_rank']>=i-6) & (csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
        a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
        print(len(a), len(b))
        sp.f1_score(b, a, if_print=True)

        b = sp.get_csv_label(csv_data_p, i + 1)
        a = csv_data_p[(csv_data_p['day_rank']>=i-6) & (csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
        a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
        print(len(a), len(b))
        sp.f1_score(b, a, if_print=True)

    '''
    纯马后炮测试, 交 标签日期中,曾有过任何记录 的商品
    '''
    for i in range(20, 30):
        print("\n第" + str(i + 1) + "天为标签")
        b = sp.get_csv_label(csv_data_all, i + 1)
        a = csv_data_all[(csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
        a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
        print(len(a), len(b))
        sp.f1_score(b, a, if_print=True)

        b = sp.get_csv_label(csv_data_p, i + 1)
        a = csv_data_p[(csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates()
        a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])]
        print(len(a), len(b))
        sp.f1_score(b, a, if_print=True)

    '''