def get_features_all(df, df1):
    """Add count features computed over ``df`` and ``df1`` taken together.

    The two frames are stacked, ``docount`` (defined elsewhere) is applied to
    the stacked frame for ``register_type`` and ``device_type``, ``user_id``
    is dropped, and the result is split back at the original length of
    ``df``.

    NOTE(review): SOURCE contains other definitions named
    ``get_features_all``; if they share a module, a later one replaces this.

    Args:
        df: first frame (its row count decides the split point).
        df1: second frame, stacked below ``df``.

    Returns:
        Tuple ``(df, df1)``: the first ``len(df)`` rows and the remaining
        rows of the augmented frame.
    """
    import pandas as pd  # local import keeps this block self-contained

    lendf = len(df)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    df = pd.concat([df, df1])
    del df1
    gc.collect()
    df = docount(df, df, 'ALL', ['register_type'])
    df = docount(df, df, 'ALL', ['device_type'])
    del df['user_id']
    # The positional split assumes docount keeps row order and row count
    # -- TODO confirm against the helper.
    df1 = df[lendf:]
    df = df[:lendf]
    return df, df1
def get_features_all(df, df1):
    """Stack ``df`` and ``df1``, add a register-type count, keep ``used``.

    ``docount`` (defined elsewhere) is applied to the stacked frame, then
    ``user_id`` is dropped, the frame is restricted to the ``used`` columns,
    and the rows are split back at the original length of ``df``.

    NOTE(review): SOURCE contains other definitions named
    ``get_features_all``; if they share a module, a later one replaces this.

    Args:
        df: first frame (its row count decides the split point).
        df1: second frame, stacked below ``df``.

    Returns:
        Tuple ``(df, df1)`` restricted to the ``used`` columns.

    Raises:
        KeyError: if any column named in ``used`` is missing.
    """
    import pandas as pd  # local import keeps this block self-contained

    lendf = len(df)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    df = pd.concat([df, df1])
    del df1
    gc.collect()
    # NOTE(review): the column this call presumably adds is not in `used`,
    # so it is discarded below; the call is kept to preserve behaviour.
    df = docount(df, df, 'ALL', ['register_type'])
    del df['user_id']

    # Final feature selection. The unused candidate/dropped-feature lists
    # (ccc, ccc1, ddd) from earlier experiments were never read and are gone.
    used = [
        'device_type',
        'register_type',
        'action_type=0$user_id#rate',
        'act_page=1$user_id#',
        'first_day_act$user_id_by_author_id_iq',
        'act_page=0$user_id#rate',
        'last_day_act$author_video_m',
        'act_page=2$user_id#',
        'actrate',
        'app$user_id#',
        'last_day_action_type=0$user_id#',
        'first_day_actrate',
        'action_type=5$user_id#rate',
    ]
    df = df[used]
    # The positional split assumes docount keeps row order and row count
    # -- TODO confirm against the helper.
    df1 = df[lendf:]
    df = df[:lendf]
    return df, df1
def get_features_all(df, df1):
    """Stack ``df`` and ``df1``, add group-level features, keep ``used``.

    On the stacked frame this applies ``domean`` (mean of ``act$user_id#``
    grouped by ``register_type``) and ``docount`` for ``register_type`` and
    ``device_type``; all helpers are defined elsewhere. ``user_id`` is then
    dropped, the frame is restricted to the ``used`` columns, and the rows
    are split back at the original length of ``df``.

    Args:
        df: first frame (its row count decides the split point).
        df1: second frame, stacked below ``df``.

    Returns:
        Tuple ``(df, df1)`` restricted to the ``used`` columns.

    Raises:
        KeyError: if any column named in ``used`` is missing.
    """
    import pandas as pd  # local import keeps this block self-contained

    lendf = len(df)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    df = pd.concat([df, df1])
    del df1
    gc.collect()

    # The original looped over a one-element list; the call is inlined.
    df = domean(df, df, 'All', ['register_type'], 'act$user_id#')
    gc.collect()
    df = docount(df, df, 'ALL', ['register_type'])
    # NOTE(review): the device_type count is not in `used`, so it is
    # discarded below; the call is kept to preserve behaviour.
    df = docount(df, df, 'ALL', ['device_type'])
    del df['user_id']

    # Final feature selection. The unused candidate/dropped-feature lists
    # (ccc, ccc1, ddd) from earlier experiments were never read and are gone.
    used = [
        'device_type',
        'register_type',
        'actrate',
        'action_type=0$user_id#rate',
        'videorate',
        'act_page=1$user_id#rate',
        'action_type=2$user_id#rate',
        'act_page=3$user_id#rate',
        'act_page=0$user_id#',
        'action_type=0$user_id#',
        'act_page=0$user_id#rate',
        'pageall',
        'act_page=023$user_id#',
        'video0',
        'All$register_type_by_act$user_id#_mean',
        'ALL$register_type#',
    ]
    df = df[used]
    # The positional split assumes the helpers keep row order and row count
    # -- TODO confirm against the helpers.
    df1 = df[lendf:]
    df = df[:lendf]
    return df, df1
def get_features0(df, d):
    """Build per-user video/act features from the single day ``d``.

    Reads the module-level ``video`` and ``act`` tables, keeps the rows with
    ``day == d`` and joins counts, rates and distinct-value features onto
    ``df`` through the ``docount`` / ``doiq`` helpers (defined elsewhere).
    This code relies on ``docount`` producing ``'<name>$<key>#'`` columns and
    ``doiq`` producing ``'<name>$<key>_by_<col>_iq'`` columns, which it reads
    right after each call.

    Args:
        df: user frame; must contain ``user_id`` and ``register_day``.
        d: day value to select.

    Returns:
        The augmented frame without ``register_day``.
    """
    tvideo = video[video.day == d]
    tact = act[act.day == d]

    df = docount(df, tvideo, 'video', ['user_id'])
    gc.collect()
    # The small constant keeps the division defined for an empty day.
    df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)

    df = docount(df, tact, 'act', ['user_id'])
    gc.collect()
    df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)

    # Per-page counts and their share of the user's actions.
    for c in [0, 1, 2, 3, 4]:
        prefix = 'act_page=' + str(c)
        df = docount(df, tact[tact['page'] == c], prefix, ['user_id'])
        gc.collect()
        df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
            df['act$user_id#'] + 0.00001)
    df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                  df['act_page=3$user_id#'])
    df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                   df['act_page=3$user_id#'] +
                                   df['act_page=0$user_id#'])

    # Per-action-type counts and their share of the user's actions.
    for c in [0, 1, 2, 3, 4, 5]:
        prefix = 'action_type=' + str(c)
        df = docount(df, tact[tact['action_type'] == c], prefix, ['user_id'])
        gc.collect()
        df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
            df['act$user_id#'] + 0.00001)
    df['action_type=01$user_id#'] = (df['action_type=0$user_id#'] +
                                     df['action_type=1$user_id#'])

    def iszero(s):
        # 0 -> 0, anything else (including NaN) -> 1.
        if s == 0:
            return 0
        return 1

    # NOTE(review): the original also ran loops over c in [1..4] and
    # [1..5] that never used `c`, with results overwritten afterwards; those
    # dead stores are removed. The values below reproduce the original's
    # final result exactly, including `pageall` being multiplied five times
    # by the action_type=0 count. That looks like a copy-paste slip (the
    # intent was presumably a product of per-page / per-action indicators),
    # but `pageall` feeds a trained model, so it is not changed here.
    df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
    df['actionall'] = df['action_type=0$user_id#'].apply(iszero)
    for _ in range(5):
        df['pageall'] = df['pageall'] * df['action_type=0$user_id#']

    df['act0'] = df['act$user_id#'].apply(iszero)
    df['video0'] = df['video$user_id#'].apply(iszero)

    def bigact(s):
        # Bucket the action count in steps of 10, capped at 5 from 50 up.
        if s >= 50:
            return 5
        else:
            return int(s / 10)

    df['act$user_id#10'] = df['act$user_id#'].apply(bigact)

    # Count actions received as an author: join act.author_id on the user's
    # own id via a temporary column.
    df['author_id'] = df['user_id']
    df = docount(df, tact, 'act', ['author_id'])
    gc.collect()

    df = doiq(df, tact, 'act', ['user_id'], 'video_id')
    gc.collect()
    df = doiq(df, tact, 'act', ['user_id'], 'author_id')
    gc.collect()
    df['act$author_video_m'] = (df['act$user_id_by_video_id_iq'] /
                                df['act$user_id_by_author_id_iq'])

    del df['register_day'], df['author_id']
    return df
def get_features(df, d1, d2):
    """Build per-user app/video/act features over the day window [d1, d2].

    Rows of the module-level ``app``, ``act`` and ``video`` tables whose
    ``day`` lies in ``[d1, d2]`` are selected and ``day`` is re-based so that
    ``d1`` becomes 0. Counts, rates and distinct-value features are then
    joined onto ``df`` through ``docount`` / ``doiq`` (defined elsewhere).

    NOTE(review): SOURCE contains other definitions named ``get_features``;
    if they share a module, a later one replaces this one.

    Args:
        df: user frame; must contain ``user_id`` and ``register_day``.
        d1: first day of the window (inclusive).
        d2: last day of the window (inclusive).

    Returns:
        The augmented frame without ``register_day``.
    """
    # Explicit copies: the `day` column is rewritten below, and assigning
    # into a filtered slice raises SettingWithCopyWarning.
    tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
    tact = act[(act.day >= d1) & (act.day <= d2)].copy()
    tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
    tapp['day'] = tapp['day'] - d1
    tact['day'] = tact['day'] - d1
    tvideo['day'] = tvideo['day'] - d1
    lastday = d2 - d1

    on_last = tact.day == lastday
    # NOTE(review): the features are named "first_day" but select the day
    # before the last; the two coincide only for a two-day window -- confirm.
    on_prev = tact.day == lastday - 1
    last_act = tact[on_last]
    prev_act = tact[on_prev]
    last2_act = tact[tact.day >= lastday - 1]

    df = docount(df, tapp, 'app', ['user_id'])
    gc.collect()
    df = docount(df, tapp[tapp.day == lastday], 'last_day_app', ['user_id'])
    gc.collect()

    df = docount(df, tvideo, 'video', ['user_id'])
    gc.collect()
    # The small constant keeps the division defined for an empty window.
    df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)

    df = docount(df, tact, 'act', ['user_id'])
    gc.collect()
    df = docount(df, last_act, 'last_day_act', ['user_id'])
    gc.collect()
    df = docount(df, prev_act, 'first_day_act', ['user_id'])
    gc.collect()
    total_act = tact.shape[0] + 0.000001
    df['actrate'] = df['act$user_id#'] / total_act
    df['last_day_actrate'] = df['last_day_act$user_id#'] / total_act
    df['first_day_actrate'] = df['first_day_act$user_id#'] / total_act
    df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
    df['act_gap'] = df['last_day_act$user_id#'] - df['first_day_act$user_id#']

    def iszero(s):
        # 0 -> 0, anything else (including NaN) -> 1.
        if s == 0:
            return 0
        return 1

    df['act0'] = df['act$user_id#'].apply(iszero)
    df['video0'] = df['video$user_id#'].apply(iszero)

    # One combined mask replaces the chained tact[a][b] indexing, which made
    # pandas reindex the second mask and emit a warning.
    df = docount(df, tact[on_last & (tact['page'] == 1)],
                 'last_day_act_page=1', ['user_id'])
    gc.collect()

    for c in [0, 1, 2, 3, 4]:
        prefix = 'act_page=' + str(c)
        df = docount(df, tact[tact['page'] == c], prefix, ['user_id'])
        gc.collect()
        df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
            df['act$user_id#'] + 0.00001)
    df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                  df['act_page=3$user_id#'])
    df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                   df['act_page=3$user_id#'] +
                                   df['act_page=0$user_id#'])

    for c in [0, 1, 2, 3, 4, 5]:
        prefix = 'action_type=' + str(c)
        is_type = tact['action_type'] == c
        df = docount(df, tact[is_type], prefix, ['user_id'])
        gc.collect()
        df = docount(df, tact[on_last & is_type], 'last_day_' + prefix,
                     ['user_id'])
        gc.collect()
        df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
            df['act$user_id#'] + 0.00001)

    # Kept from the original; the column is deleted again before returning.
    df['author_id'] = df['user_id']

    # Distinct videos / authors per user and their ratio, for three slices.
    for prefix, rows in (('last_day_act', last_act),
                         ('first_day_act', prev_act),
                         ('last2_day_act', last2_act)):
        df = doiq(df, rows, prefix, ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, rows, prefix, ['user_id'], 'author_id')
        gc.collect()
        df[prefix + '$author_video_m'] = (
            df[prefix + '$user_id_by_video_id_iq'] /
            df[prefix + '$user_id_by_author_id_iq'])

    del df['register_day'], df['author_id']
    return df
def get_features(df, d1, d2):
    """Build per-user app/video/act features over the day window [d1, d2].

    Rows of the module-level ``app``, ``act`` and ``video`` tables whose
    ``day`` lies in ``[d1, d2]`` are selected and ``day`` is re-based so that
    ``d1`` becomes 0. Features are joined onto ``df`` through the
    ``docount`` / ``domin`` / ``domax`` / ``domean`` / ``dovar`` / ``doiq``
    helpers (defined elsewhere) and through direct group-bys.

    NOTE(review): SOURCE contains other definitions named ``get_features``;
    if they share a module, a later one replaces this one.

    Args:
        df: user frame; must contain ``user_id`` and ``register_time`` (read
            here but never set here).
        d1: first day of the window (inclusive).
        d2: last day of the window (inclusive).

    Returns:
        The augmented frame.
    """
    # Explicit copies: the `day` column is rewritten below, and assigning
    # into a filtered slice raises SettingWithCopyWarning.
    tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
    tact = act[(act.day >= d1) & (act.day <= d2)].copy()
    tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
    tapp['day'] = tapp['day'] - d1
    tact['day'] = tact['day'] - d1
    tvideo['day'] = tvideo['day'] - d1
    lastday = d2 - d1

    def merge_daily_stats(frame, per_day, prefix):
        # Join max/mean/var of the per-day action counts (one row per user,
        # one column per day) onto `frame`.
        stats = (
            ('max', per_day.max(axis=1)),
            ('mean', per_day.mean(axis=1)),
            ('var', per_day.var(axis=1)),
        )
        for stat, values in stats:
            column = values.rename(prefix + 'actcount_' + stat).reset_index()
            frame = pd.merge(frame, column, on=['user_id'], how='left')
        return frame

    # ---- app ----
    df = docount(df, tapp, 'app', ['user_id'])
    df = domax(df, tapp, 'app', ['user_id'], 'day')
    df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
    df['app_day_missing'] = df['register_time'] - df['app$user_id#']
    df['app_mean#'] = df['app$user_id#'] / df['register_time']
    del df['app$user_id#'], df['app$user_id_by_day_max']
    df = dovar(df, tapp, 'app', ['user_id'], 'day')
    for i in range(8):
        recent = tapp[tapp.day >= lastday - i]
        prefix = 'app_last_' + str(i)
        df = docount(df, recent, prefix, ['user_id'])
        # NOTE(review): nesting reconstructed from collapsed source; mean
        # and var are both taken to be guarded by `i >= 3` -- confirm.
        if i >= 3:
            df = domean(df, recent, prefix, ['user_id'], 'day')
            df = dovar(df, recent, prefix, ['user_id'], 'day')
    gc.collect()

    # ---- video ----
    df = docount(df, tvideo, 'video', ['user_id'])
    df = domin(df, tvideo, 'video', ['user_id'], 'day')
    df = domax(df, tvideo, 'video', ['user_id'], 'day')
    df = doiq(df, tvideo, 'video', ['user_id'], 'day')
    df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
    df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
    df['video_day_gap'] = (df['video$user_id_by_day_max'] -
                           df['video$user_id_by_day_min'] + 1)
    df['video_mean#'] = df['video$user_id#'] / df['register_time']
    del (df['video$user_id#'], df['video$user_id_by_day_max'],
         df['video$user_id_by_day_min'])
    df = dovar(df, tvideo, 'video', ['user_id'], 'day')
    video_last_8 = tvideo[tvideo.day > lastday - 8]
    df = domean(df, video_last_8, 'video_last_8', ['user_id'], 'day')
    df = dovar(df, video_last_8, 'video_last_8', ['user_id'], 'day')
    df = docount(df, video_last_8, 'video_last_8', ['user_id'])
    gc.collect()

    # ---- act ----
    df = merge_daily_stats(
        df, tact.groupby(['user_id', 'day']).size().unstack(), '')
    df = docount(df, tact, 'act', ['user_id'])
    df = domin(df, tact, 'act', ['user_id'], 'day')
    df = domax(df, tact, 'act', ['user_id'], 'day')
    df = doiq(df, tact, 'act', ['user_id'], 'day')
    df['act_day_gap'] = (df['act$user_id_by_day_max'] -
                         df['act$user_id_by_day_min'] + 1)
    df['act_day_missing'] = df['register_time'] - df['act$user_id_by_day_iq']
    df['act_mean#'] = df['act$user_id#'] / df['register_time']
    del df['act$user_id#']
    df = dovar(df, tact, 'act', ['user_id'], 'day')
    for i in range(8):
        recent = tact[tact.day >= lastday - i]
        prefix = 'act_last_' + str(i)
        df = docount(df, recent, prefix, ['user_id'])
        # NOTE(review): nesting reconstructed from collapsed source; the
        # per-day-count statistics are taken to sit under `i >= 3` together
        # with mean/var -- confirm against the original layout.
        if i >= 3:
            df = domean(df, recent, prefix, ['user_id'], 'day')
            df = dovar(df, recent, prefix, ['user_id'], 'day')
            per_day = recent.groupby(['user_id', 'day']).size().unstack()
            df = merge_daily_stats(df, per_day, prefix + '_')
    gc.collect()

    # Per-page counts: whole window (normalised by register_time), then the
    # last 8 and last 3 days (raw counts).
    page_list = list(tact['page'].unique())
    for c in page_list:
        column = 'act_page=' + str(c) + '$user_id#'
        df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c),
                     ['user_id'])
        df[column] = df[column] / df['register_time']
    for days in (8, 3):
        in_tail = tact.day > lastday - days
        for c in page_list:
            df = docount(df, tact[(tact['page'] == c) & in_tail],
                         'act_last_' + str(days) + '_page=' + str(c),
                         ['user_id'])

    # Actions received as an author: join act.author_id on the user's own
    # id via a temporary column.
    df['author_id'] = df['user_id']
    df = docount(df, tact, 'act', ['author_id'])
    df['act$author_id#'] = df['act$author_id#'] / df['register_time']

    df = doiq(df, tact, 'act', ['user_id'], 'author_id')
    df['act$user_id_by_author_id_iq'] = (df['act$user_id_by_author_id_iq'] /
                                         df['register_time'])
    df = doiq(df, tact, 'act', ['user_id'], 'video_id')
    df['act$user_id_by_video_id_iq'] = (df['act$user_id_by_video_id_iq'] /
                                        df['register_time'])
    for i in range(8):
        recent = tact[tact.day >= lastday - i]
        prefix = 'act_last_' + str(i)
        df = doiq(df, recent, prefix, ['user_id'], 'author_id')
        df = doiq(df, recent, prefix, ['user_id'], 'video_id')

    # Per-action-type counts; the original list skips action type 4.
    for c in [0, 1, 2, 3, 5]:
        column = 'action_type=' + str(c) + '$user_id#'
        df = docount(df, tact[tact['action_type'] == c],
                     'action_type=' + str(c), ['user_id'])
        gc.collect()
        df[column] = df[column] / df['register_time']
    for days in (8, 3):
        in_tail = tact.day > lastday - days
        for c in [0, 1, 2, 3]:
            df = docount(df, tact[(tact['action_type'] == c) & in_tail],
                         'act_last_' + str(days) + '_action_type=' + str(c),
                         ['user_id'])

    # A disabled "max consecutive active days" experiment lived here as a
    # bare triple-quoted string; it was a no-op and has been removed.
    del df['author_id']
    gc.collect()
    return df
def get_features(df, ed):
    """Build per-user features from all activity up to and including ``ed``.

    Rows of the module-level ``app``, ``act`` and ``video`` tables with
    ``day <= ed`` are selected and given a ``time`` column equal to
    ``ed - day``. Features are joined onto ``df`` through the ``docount`` /
    ``domin`` / ``dovar`` / ``doiq`` helpers (defined elsewhere) and through
    direct group-bys. When the module-level flag ``get_author_feature`` is
    truthy, per-rank action counts for ranks 1..500 are added as columns
    ``a1`` .. ``a500``.

    Side effect: ``register_time`` is added to, and ``register_day`` removed
    from, the frame passed in (both done in place before any helper runs).

    NOTE(review): SOURCE contains other definitions named ``get_features``
    with a different signature; if they share a module, only the last one
    is callable.

    Args:
        df: user frame; must contain ``user_id`` and ``register_day``.
        ed: last day (inclusive) of the observation period.

    Returns:
        The augmented frame.
    """
    df['register_time'] = ed - df.register_day + 1
    del df['register_day']

    # Explicit copies: a `time` column is added below, and assigning into a
    # filtered slice raises SettingWithCopyWarning.
    tapp = app[app.day <= ed].copy()
    tact = act[act.day <= ed].copy()
    tvideo = video[video.day <= ed].copy()
    tapp['time'] = ed - tapp.day
    tact['time'] = ed - tact.day
    tvideo['time'] = ed - tvideo.day

    def count_by(rows, key, name):
        # Row count per `key` as a two-column frame ready for a left merge.
        return rows.groupby([key]).size().rename(name).reset_index()

    if get_author_feature:
        gp = tact[(tact.time < 14) & (tact.ranks <= 500)].groupby(
            ['user_id', 'ranks']).size().unstack().reset_index()
        cols = list(gp.columns)
        # Ranks that never occur get an all-zero column (and are printed).
        for i in range(1, 501):
            if i not in cols:
                print(i)
                gp[i] = 0
        # Re-add each rank column under the name 'a<rank>', in rank order.
        for i in range(1, 501):
            gp['a' + str(i)] = gp[i]
            del gp[i]
        df = df.merge(gp, on=['user_id'], how='left')

    # ---- app ----
    df = docount(df, tapp, 'app', ['user_id'])
    df = domin(df, tapp, 'app', ['user_id'], 'day')
    df = dovar(df, tapp, 'app', ['user_id'], 'day')
    df['app_rate'] = df['app$user_id#'] / df['register_time']

    # ---- video ----
    df = docount(df, tvideo, 'video', ['user_id'])
    df = domin(df, tvideo, 'video', ['user_id'], 'day')
    df = doiq(df, tvideo, 'video', ['user_id'], 'day')
    df = doiq(df, tvideo[tvideo.time < 16], 'video16', ['user_id'], 'day')
    df['video_rate'] = df['video$user_id_by_day_iq'] / df['register_time']
    df['video_rate1'] = df['video$user_id_by_day_iq'] / df['app$user_id#']
    df['video_mean'] = df['video$user_id#'] / df['register_time']
    df['video_mean1'] = df['video$user_id#'] / df['app$user_id#']
    df['video_mean2'] = df['video$user_id#'] / df['video$user_id_by_day_iq']

    # ---- act ----
    act16 = tact[tact['time'] < 16]
    df = docount(df, tact, 'act', ['user_id'])
    df = domin(df, tact, 'act', ['user_id'], 'day')
    df = doiq(df, tact, 'act', ['user_id'], 'day')
    df = doiq(df, act16, 'act16', ['user_id'], 'day')
    df['act_rate'] = df['act$user_id_by_day_iq'] / df['register_time']
    df['act_rate1'] = df['act$user_id_by_day_iq'] / df['act$user_id#']
    df['act_mean'] = df['act$user_id#'] / df['register_time']
    df['act_mean1'] = df['act$user_id#'] / df['app$user_id#']
    df['act_mean2'] = df['act$user_id#'] / df['act$user_id_by_day_iq']

    # Only the 7-day app weekend count is enabled; the 14-day and the
    # video/act weekend variants were commented out in the original.
    df = docount(df, tapp[(tapp.time < 7) & (tapp.weekend == 1)],
                 'app7_weekend', ['user_id'])

    # NOTE(review): this filter uses `ranks < 500`, while the author-feature
    # branch above uses `ranks <= 500` -- confirm which is intended.
    top500 = tact[(tact.time < 16) & (tact.ranks < 500)]
    df = doiq(df, top500, 'act16_top500', ['user_id'], 'author_id')
    df = doiq(df, top500, 'act16_top500', ['user_id'], 'video_id')

    for i in range(2, 7):
        gp = count_by(tapp[tapp.time <= i], 'user_id',
                      'last_' + str(i) + '_days_app#')
        df = pd.merge(df, gp, on=['user_id'], how='left')
    for i in range(2, 7):
        gp = count_by(tact[tact.time <= i], 'user_id',
                      'last_' + str(i) + '_days_act#')
        df = pd.merge(df, gp, on=['user_id'], how='left')

    # Distinct authors / videos each user interacted with.
    gp = tact.groupby(['user_id'])['author_id'].nunique().rename(
        'act_author_id_u#').reset_index()
    df = pd.merge(df, gp, on=['user_id'], how='left')
    df['act_author_id_u_mean#'] = df['act_author_id_u#'] / df['register_time']
    gp = tact.groupby(['user_id'])['video_id'].nunique().rename(
        'act_video_id_u#').reset_index()
    df = pd.merge(df, gp, on=['user_id'], how='left')
    df['act_video_id_u_mean#'] = df['act_video_id_u#'] / df['register_time']
    df['video_author_m'] = df['act_video_id_u#'] / df['act_author_id_u#']
    df['act_author_id_u_mean1#'] = (df['act_author_id_u#'] /
                                    df['act$user_id_by_day_iq'])
    df['act_video_id_u_mean1#'] = (df['act_video_id_u#'] /
                                   df['act$user_id_by_day_iq'])

    # Combined masks replace the chained tact[a][b] indexing, which made
    # pandas reindex the second mask and emit a warning.
    for i in [3, 7, 14]:
        for c in [0, 1, 2, 3]:
            rows = tact[(tact['time'] < i) & (tact['page'] == c)]
            gp = count_by(rows, 'user_id',
                          'act_' + str(i) + '_author_page_' + str(c) + '_u#')
            df = pd.merge(df, gp, on=['user_id'], how='left')
    for i in [3, 7, 14]:
        for c in [0, 1, 2, 3]:
            rows = tact[(tact['time'] < i) & (tact['action_type'] == c)]
            gp = count_by(
                rows, 'user_id',
                'act_' + str(i) + '_author_action_type_' + str(c) + '_u#')
            df = pd.merge(df, gp, on=['user_id'], how='left')

    def get_last_gap(s):
        # Difference between the two largest values; None with fewer than
        # two values.
        s = list(s)
        n = len(s)
        if n > 1:
            s.sort()
            return s[n - 1] - s[n - 2]
        return None

    for rows, name in ((tapp[tapp['time'] < 16], 'app_last_gap'),
                       (tvideo[tvideo['time'] < 16], 'video_last_gap'),
                       (act16, 'act_last_gap')):
        gp = rows.groupby(['user_id'])['day'].unique().apply(
            get_last_gap).rename(name).reset_index()
        df = pd.merge(df, gp, on=['user_id'], how='left')

    # Actions received as an author: join act.author_id on the user's own
    # id via a temporary column.
    df['author_id'] = df['user_id']
    df = pd.merge(df, count_by(act16, 'author_id', 'author#'),
                  on=['author_id'], how='left')
    for i in [7, 14]:
        for c in [0, 1, 2, 3]:
            rows = tact[(tact['time'] < i) & (tact['action_type'] == c)]
            gp = count_by(
                rows, 'author_id',
                'act_' + str(i) + '_author_action_type_' + str(c) + '_a#')
            df = pd.merge(df, gp, on=['author_id'], how='left')
    for c in [1, 2, 3, 4]:
        gp = count_by(act16[act16['page'] == c], 'author_id',
                      'author_act_page_' + str(c) + '#')
        df = pd.merge(df, gp, on=['author_id'], how='left')
    del df['author_id']

    # Drop the raw totals that only served as numerators/denominators.
    del (df['app$user_id#'], df['video$user_id#'], df['act$user_id#'],
         df['act_author_id_u#'], df['act_video_id_u#'])
    del df['act$user_id_by_day_iq'], df['video$user_id_by_day_iq']

    # Per-day counts for each of the last 16 days (time 0 is `ed` itself).
    for i in range(16):
        gp = count_by(tapp[tapp.time == i], 'user_id', 'app_' + str(i))
        df = df.merge(gp, on=['user_id'], how='left')
    for i in range(16):
        gp = count_by(tvideo[tvideo.time == i], 'user_id',
                      'video_count_' + str(i))
        df = df.merge(gp, on=['user_id'], how='left')
    for i in range(16):
        gp = count_by(tact[tact.time == i], 'user_id',
                      'act_count_' + str(i))
        df = df.merge(gp, on=['user_id'], how='left')
    return df
def get_features(df, d1, d2):
    """Build a compact set of recent-activity features over [d1, d2].

    Rows of the module-level ``app``, ``act`` and ``video`` tables whose
    ``day`` lies in ``[d1, d2]`` are selected and ``day`` is re-based so that
    ``d1`` becomes 0. Features are joined onto ``df`` through the
    ``docount`` / ``domax`` / ``doiq`` helpers (defined elsewhere). The
    original carried many commented-out variants (whole-window video and
    act counts, variances, per-day count statistics); only the statements
    that were live are kept here.

    Side effect: ``register_time`` is added to, and ``register_day`` removed
    from, the frame passed in (both done in place before any helper runs).

    Args:
        df: user frame; must contain ``user_id`` and ``register_day``.
        d1: first day of the window (inclusive).
        d2: last day of the window (inclusive).

    Returns:
        The augmented frame.
    """
    # Explicit copies: the `day` column is rewritten below, and assigning
    # into a filtered slice raises SettingWithCopyWarning.
    tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
    tact = act[(act.day >= d1) & (act.day <= d2)].copy()
    tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
    tapp['day'] = tapp['day'] - d1
    tact['day'] = tact['day'] - d1
    tvideo['day'] = tvideo['day'] - d1
    lastday = d2 - d1

    df['register_time'] = d2 - df.register_day + 1
    del df['register_day']

    # ---- app: launches per registered day; the raw count is dropped ----
    df = docount(df, tapp, 'app', ['user_id'])
    df['app_mean#'] = df['app$user_id#'] / df['register_time']
    del df['app$user_id#']
    gc.collect()

    # ---- video: counts over the trailing 2- and 3-day slices ----
    df = docount(df, tvideo[tvideo.day > lastday - 2], 'video_last_2',
                 ['user_id'])
    df = docount(df, tvideo[tvideo.day > lastday - 3], 'video_last_3',
                 ['user_id'])
    gc.collect()

    # ---- act ----
    # `act_last_N` keeps rows with day > lastday - N.
    act_last = {n: tact[tact.day > lastday - n] for n in (3, 2, 1)}

    df = domax(df, tact, 'act', ['user_id'], 'day')
    df['last_act_day'] = lastday - df['act$user_id_by_day_max'] + 1
    del df['act$user_id_by_day_max']
    df = docount(df, act_last[2], 'act_last_2', ['user_id'])
    df = docount(df, act_last[3], 'act_last_3', ['user_id'])
    gc.collect()

    for c in [0, 1, 2, 3]:
        for n in (3, 2, 1):
            rows = act_last[n]
            df = docount(df, rows[rows['page'] == c],
                         'act_last_' + str(n) + '_page=' + str(c),
                         ['user_id'])

    for n in (3, 2, 1):
        prefix = 'act_last_' + str(n)
        df = doiq(df, act_last[n], prefix, ['user_id'], 'author_id')
        df = doiq(df, act_last[n], prefix, ['user_id'], 'video_id')

    for c in [0, 1, 2, 3]:
        for n in (3, 2, 1):
            rows = act_last[n]
            df = docount(df, rows[rows['action_type'] == c],
                         'act_last_' + str(n) + '_action_type=' + str(c),
                         ['user_id'])
    gc.collect()
    return df