Пример #1
0
    def get_features_all(df, df1):
        """Add features computed over both frames together, then split them.

        The two frames are stacked row-wise so that ``docount`` sees all rows
        at once, and are separated again by row position afterwards.

        Args:
            df: first frame; its rows come back as the first returned frame.
            df1: second frame; its rows come back as the second returned frame.

        Returns:
            ``(df, df1)`` with the columns added by ``docount`` and without
            ``user_id``.
        """
        # Local import keeps this function independent of file-level imports.
        import pandas as pd

        n_first = len(df)
        # DataFrame.append was removed in pandas 2.0; concat is its equivalent.
        merged = pd.concat([df, df1])
        del df, df1
        gc.collect()

        # presumably docount adds a per-group row-count column - verify against
        # its definition.
        for key in ('register_type', 'device_type'):
            merged = docount(merged, merged, 'ALL', [key])

        del merged['user_id']

        # The positional split relies on docount keeping row count and order.
        return merged[:n_first], merged[n_first:]
Пример #2
0
    def get_features_all(df, df1):
        """Stack both frames, add the joint count feature, keep selected columns.

        The two frames are concatenated row-wise, passed through ``docount``,
        reduced to the columns listed in ``used`` and separated again by row
        position.

        Args:
            df: first frame; its rows come back as the first returned frame.
            df1: second frame; its rows come back as the second returned frame.

        Returns:
            ``(df, df1)`` restricted to the ``used`` columns.
        """
        # Local import keeps this function independent of file-level imports.
        import pandas as pd

        n_first = len(df)
        # DataFrame.append was removed in pandas 2.0; concat is its equivalent.
        merged = pd.concat([df, df1])
        del df, df1
        gc.collect()

        # NOTE(review): no 'ALL$...' column appears in `used`, so the result of
        # this call is presumably dropped below - confirm it is still needed.
        merged = docount(merged, merged, 'ALL', ['register_type'])

        del merged['user_id']

        # Final feature set; every other column is discarded.
        used = [
            'device_type',
            'register_type',
            'action_type=0$user_id#rate',
            'act_page=1$user_id#',
            'first_day_act$user_id_by_author_id_iq',
            'act_page=0$user_id#rate',
            'last_day_act$author_video_m',
            'act_page=2$user_id#',
            'actrate',
            'app$user_id#',
            'last_day_action_type=0$user_id#',
            'first_day_actrate',
            'action_type=5$user_id#rate',
        ]
        merged = merged[used]

        # The positional split relies on docount keeping row count and order.
        return merged[:n_first], merged[n_first:]
Пример #3
0
    def get_features_all(df, df1):
        """Stack both frames, add joint group features, keep selected columns.

        The two frames are concatenated row-wise, passed through ``domean`` and
        ``docount``, reduced to the columns listed in ``used`` and separated
        again by row position.

        Args:
            df: first frame; its rows come back as the first returned frame.
            df1: second frame; its rows come back as the second returned frame.

        Returns:
            ``(df, df1)`` restricted to the ``used`` columns.
        """
        # Local import keeps this function independent of file-level imports.
        import pandas as pd

        n_first = len(df)
        # DataFrame.append was removed in pandas 2.0; concat is its equivalent.
        merged = pd.concat([df, df1])
        del df, df1
        gc.collect()

        # Source of the 'All$register_type_by_act$user_id#_mean' column that is
        # selected below.
        merged = domean(merged, merged, 'All', ['register_type'],
                        'act$user_id#')
        gc.collect()
        # Source of 'ALL$register_type#'; the device_type count is computed as
        # well but is not part of `used`.
        merged = docount(merged, merged, 'ALL', ['register_type'])
        merged = docount(merged, merged, 'ALL', ['device_type'])

        del merged['user_id']

        # Final feature set; every other column is discarded.
        used = [
            'device_type',
            'register_type',
            'actrate',
            'action_type=0$user_id#rate',
            'videorate',
            'act_page=1$user_id#rate',
            'action_type=2$user_id#rate',
            'act_page=3$user_id#rate',
            'act_page=0$user_id#',
            'action_type=0$user_id#',
            'act_page=0$user_id#rate',
            'pageall',
            'act_page=023$user_id#',
            'video0',
            'All$register_type_by_act$user_id#_mean',
            'ALL$register_type#',
        ]
        merged = merged[used]

        # The positional split relies on the helpers keeping row count and
        # order.
        return merged[:n_first], merged[n_first:]
Пример #4
0
    def get_features0(df, d):
        """Build per-user activity features from a single day of logs.

        Reads the module-level ``video`` and ``act`` tables, keeps only rows
        whose ``day`` equals ``d``, and joins counts, ratios and 0/1 flags onto
        ``df`` through the ``docount`` / ``doiq`` helpers.

        Args:
            df: user frame containing at least ``user_id`` and
                ``register_day``.
            d: value of ``day`` to select from ``video`` and ``act``.

        Returns:
            The feature frame. ``register_day`` and the temporary
            ``author_id`` column are removed before returning.
        """

        def iszero(s):
            """Map a zero count to 0 and anything else to 1."""
            return 0 if s == 0 else 1

        def bigact(s):
            """Bucket a count by tens; everything from 50 upwards is bucket 5."""
            return 5 if s >= 50 else int(s / 10)

        pages = [0, 1, 2, 3, 4]
        action_types = [0, 1, 2, 3, 4, 5]

        tvideo = video[video.day == d]
        tact = act[act.day == d]

        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        # Small epsilon keeps the ratio finite when the day has no rows.
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)

        # Per-page counts and their share of the user's total actions.
        for c in pages:
            name = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], name, ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        # Per-action-type counts and their share of the user's total actions.
        for c in action_types:
            name = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], name,
                         ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['action_type=01$user_id#'] = (df['action_type=0$user_id#'] +
                                         df['action_type=1$user_id#'])

        # pageall: 1 only when the user has a non-zero count on every page.
        # The earlier version never used the loop variable and then overwrote
        # the result with the page-0 flag alone.
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
        for c in pages[1:]:
            df['pageall'] = df['pageall'] * df['act_page=' + str(c) +
                                               '$user_id#'].apply(iszero)

        # actionall: 1 only when every action type has a non-zero count.
        # The earlier version multiplied into `pageall` here by mistake.
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)
        for c in action_types[1:]:
            df['actionall'] = df['actionall'] * df['action_type=' + str(c) +
                                                   '$user_id#'].apply(iszero)

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        df['act$user_id#10'] = df['act$user_id#'].apply(bigact)

        # Expose user_id under the name author_id so the act rows in which the
        # user appears as author can be counted with the same helper.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        gc.collect()

        df['act$author_video_m'] = (df['act$user_id_by_video_id_iq'] /
                                    df['act$user_id_by_author_id_iq'])

        del df['register_day'], df['author_id']
        return df
Пример #5
0
    def get_features(df, d1, d2):
        """Build per-user activity features for the day window ``[d1, d2]``.

        Reads the module-level ``app``, ``act`` and ``video`` tables, keeps the
        rows with ``d1 <= day <= d2``, rebases ``day`` so the window starts at
        0, and joins counts and ratios onto ``df`` through the ``docount`` /
        ``doiq`` helpers.

        Args:
            df: user frame containing at least ``user_id`` and
                ``register_day``.
            d1: first day of the window (inclusive).
            d2: last day of the window (inclusive).

        Returns:
            The feature frame. ``register_day`` and the temporary
            ``author_id`` column are removed before returning.
        """

        def iszero(s):
            """Map a zero count to 0 and anything else to 1."""
            return 0 if s == 0 else 1

        # Explicit copies: `day` is rewritten below, and assigning into a
        # filtered frame otherwise raises SettingWithCopyWarning.
        tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
        tact = act[(act.day >= d1) & (act.day <= d2)].copy()
        tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
        tapp['day'] = tapp['day'] - d1
        tact['day'] = tact['day'] - d1
        tvideo['day'] = tvideo['day'] - d1
        lastday = d2 - d1

        # Row masks over tact, built once and combined with `&` below instead
        # of chaining two boolean selections (which makes pandas re-align the
        # second mask and emit a warning).
        on_last_day = tact.day == lastday
        # NOTE(review): the 'first_day_*' features use the day before the last
        # one; that is the first day only when the window spans two days.
        on_prev_day = tact.day == lastday - 1
        in_last_two = tact.day >= lastday - 1

        app_on_last_day = tapp[tapp.day == lastday]
        # Small epsilon keeps the ratios finite when the window has no rows.
        video_total = tvideo.shape[0] + 0.000001
        act_total = tact.shape[0] + 0.000001

        df = docount(df, tapp, 'app', ['user_id'])
        gc.collect()
        df = docount(df, app_on_last_day, 'last_day_app', ['user_id'])
        gc.collect()

        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        df['videorate'] = df['video$user_id#'] / video_total

        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df = docount(df, tact[on_last_day], 'last_day_act', ['user_id'])
        gc.collect()
        df = docount(df, tact[on_prev_day], 'first_day_act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / act_total
        df['last_day_actrate'] = df['last_day_act$user_id#'] / act_total
        df['first_day_actrate'] = df['first_day_act$user_id#'] / act_total
        df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
        df['act_gap'] = (df['last_day_act$user_id#'] -
                         df['first_day_act$user_id#'])

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        df = docount(df, tact[on_last_day & (tact['page'] == 1)],
                     'last_day_act_page=1', ['user_id'])
        gc.collect()

        # Per-page counts and their share of the user's total actions.
        for c in [0, 1, 2, 3, 4]:
            name = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], name, ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        # Per-action-type counts over the whole window and on the last day.
        for c in [0, 1, 2, 3, 4, 5]:
            name = 'action_type=' + str(c)
            is_type = tact['action_type'] == c
            df = docount(df, tact[is_type], name, ['user_id'])
            gc.collect()
            df = docount(df, tact[on_last_day & is_type], 'last_day_' + name,
                         ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['author_id'] = df['user_id']

        # For each sub-window: doiq over video_id and author_id, then the ratio
        # of the two resulting '*_iq' columns.
        for prefix, mask in (('last_day_act', on_last_day),
                             ('first_day_act', on_prev_day),
                             ('last2_day_act', in_last_two)):
            df = doiq(df, tact[mask], prefix, ['user_id'], 'video_id')
            gc.collect()
            df = doiq(df, tact[mask], prefix, ['user_id'], 'author_id')
            gc.collect()
            df[prefix + '$author_video_m'] = (
                df[prefix + '$user_id_by_video_id_iq'] /
                df[prefix + '$user_id_by_author_id_iq'])

        del df['register_day'], df['author_id']
        return df
Пример #6
0
    def get_features(df, d1, d2):
        """Build per-user app/video/act features for the day window [d1, d2].

        Reads the module-level ``app``, ``act`` and ``video`` tables, keeps the
        rows with ``d1 <= day <= d2``, rebases ``day`` so the window starts at
        0, and joins aggregates onto ``df`` through the ``docount`` /
        ``domin`` / ``domax`` / ``domean`` / ``dovar`` / ``doiq`` helpers.
        Many totals are divided by ``register_time``.

        Args:
            df: user frame containing at least ``user_id`` and
                ``register_time``.
            d1: first day of the window (inclusive).
            d2: last day of the window (inclusive).

        Returns:
            The feature frame; the temporary ``author_id`` column is removed
            before returning.
        """

        def merge_daily_count_stats(frame, events, prefix):
            """Join max/mean/var of each user's per-day row counts."""
            # Rows: user_id, columns: day, values: number of rows (NaN where
            # the user has no rows on that day).
            per_day = events.groupby(['user_id', 'day']).size().unstack()
            stats = (
                ('actcount_max', per_day.max(axis=1)),
                ('actcount_mean', per_day.mean(axis=1)),
                ('actcount_var', per_day.var(axis=1)),
            )
            for suffix, values in stats:
                frame = pd.merge(frame,
                                 values.rename(prefix + suffix).reset_index(),
                                 on=['user_id'],
                                 how='left')
            return frame

        # Explicit copies: `day` is rewritten below, and assigning into a
        # filtered frame otherwise raises SettingWithCopyWarning.
        tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
        tact = act[(act.day >= d1) & (act.day <= d2)].copy()
        tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
        tapp['day'] = tapp['day'] - d1
        tact['day'] = tact['day'] - d1
        tvideo['day'] = tvideo['day'] - d1
        lastday = d2 - d1

        # ---- app ----
        df = docount(df, tapp, 'app', ['user_id'])
        df = domax(df, tapp, 'app', ['user_id'], 'day')

        df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
        df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#'], df['app$user_id_by_day_max']

        df = dovar(df, tapp, 'app', ['user_id'], 'day')

        # 'app_last_<i>' covers days lastday-i .. lastday, i.e. i+1 days.
        for i in range(8):
            recent = tapp[tapp.day >= lastday - i]
            name = 'app_last_' + str(i)
            df = docount(df, recent, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent, name, ['user_id'], 'day')
                df = dovar(df, recent, name, ['user_id'], 'day')

        gc.collect()

        # ---- video ----
        df = docount(df, tvideo, 'video', ['user_id'])
        df = domin(df, tvideo, 'video', ['user_id'], 'day')
        df = domax(df, tvideo, 'video', ['user_id'], 'day')
        df = doiq(df, tvideo, 'video', ['user_id'], 'day')
        df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
        df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
        df['video_day_gap'] = (df['video$user_id_by_day_max'] -
                               df['video$user_id_by_day_min'] + 1)
        df['video_mean#'] = df['video$user_id#'] / df['register_time']
        del df['video$user_id#'], df['video$user_id_by_day_max'], df[
            'video$user_id_by_day_min']

        df = dovar(df, tvideo, 'video', ['user_id'], 'day')
        # Strict '>' here: 'video_last_8' covers exactly the last 8 days.
        video_recent = tvideo[tvideo.day > lastday - 8]
        df = domean(df, video_recent, 'video_last_8', ['user_id'], 'day')
        df = dovar(df, video_recent, 'video_last_8', ['user_id'], 'day')
        df = docount(df, video_recent, 'video_last_8', ['user_id'])
        gc.collect()

        # ---- act ----
        df = merge_daily_count_stats(df, tact, '')

        df = docount(df, tact, 'act', ['user_id'])
        df = domin(df, tact, 'act', ['user_id'], 'day')
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df = doiq(df, tact, 'act', ['user_id'], 'day')
        df['act_day_gap'] = (df['act$user_id_by_day_max'] -
                             df['act$user_id_by_day_min'] + 1)
        df['act_day_missing'] = (df['register_time'] -
                                 df['act$user_id_by_day_iq'])
        df['act_mean#'] = df['act$user_id#'] / df['register_time']
        del df['act$user_id#']

        df = dovar(df, tact, 'act', ['user_id'], 'day')

        # 'act_last_<i>' covers days lastday-i .. lastday, i.e. i+1 days.
        for i in range(8):
            recent = tact[tact.day >= lastday - i]
            name = 'act_last_' + str(i)
            df = docount(df, recent, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent, name, ['user_id'], 'day')
                df = dovar(df, recent, name, ['user_id'], 'day')
                df = merge_daily_count_stats(df, recent, name + '_')
        gc.collect()

        # Per-page counts, replaced in place by count / register_time.
        page_list = list(tact['page'].unique())
        for c in page_list:
            name = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], name, ['user_id'])
            df[name + '$user_id#'] = (df[name + '$user_id#'] /
                                      df['register_time'])

        # Raw per-page counts over the last 8 and the last 3 days.
        for window in (8, 3):
            for c in page_list:
                df = docount(
                    df,
                    tact[(tact['page'] == c) &
                         (tact.day > lastday - window)],
                    'act_last_' + str(window) + '_page=' + str(c),
                    ['user_id'])

        # Expose user_id under the name author_id so the act rows in which the
        # user appears as author can be counted with the same helper.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        df['act$author_id#'] = df['act$author_id#'] / df['register_time']

        for col in ('author_id', 'video_id'):
            df = doiq(df, tact, 'act', ['user_id'], col)
            out = 'act$user_id_by_' + col + '_iq'
            df[out] = df[out] / df['register_time']

        for i in range(8):
            recent = tact[tact.day >= lastday - i]
            name = 'act_last_' + str(i)
            df = doiq(df, recent, name, ['user_id'], 'author_id')
            df = doiq(df, recent, name, ['user_id'], 'video_id')

        # Per-action-type counts, replaced in place by count / register_time.
        # Type 4 is not part of this list.
        for c in [0, 1, 2, 3, 5]:
            name = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], name,
                         ['user_id'])
            gc.collect()
            df[name + '$user_id#'] = (df[name + '$user_id#'] /
                                      df['register_time'])

        # Raw per-action-type counts over the last 8 and the last 3 days.
        for window in (8, 3):
            for c in [0, 1, 2, 3]:
                df = docount(
                    df,
                    tact[(tact['action_type'] == c) &
                         (tact.day > lastday - window)],
                    'act_last_' + str(window) + '_action_type=' + str(c),
                    ['user_id'])

        del df['author_id']
        gc.collect()

        return df
Пример #7
0
def _last_gap(days):
    """Return the gap between the two largest values, or None if fewer than two."""
    ordered = sorted(days)
    if len(ordered) > 1:
        return ordered[-1] - ordered[-2]
    return None


def _merge_group_size(df, events, key, name):
    """Left-join onto ``df`` the number of ``events`` rows per ``key`` as ``name``."""
    sizes = events[[key]].groupby([key]).size().rename(name).reset_index()
    return pd.merge(df, sizes, on=[key], how='left')


def _merge_last_gap(df, events, name):
    """Left-join, per user, the gap between the two latest distinct days."""
    gaps = events.groupby(['user_id'])['day'].unique().apply(
        _last_gap).rename(name).reset_index()
    return pd.merge(df, gaps, on=['user_id'], how='left')


def get_features(df, ed):
    """Build per-user features from all logs up to and including day ``ed``.

    Reads the module-level ``app``, ``act`` and ``video`` tables, keeps rows
    with ``day <= ed`` and adds a ``time`` column (``ed - day``, so 0 is the
    end day itself). Aggregates are joined onto ``df`` by ``user_id``; a few
    are joined by ``author_id`` after copying ``user_id`` into that column.

    The passed-in ``df`` is modified in place at the start: ``register_time``
    is added and ``register_day`` is deleted.

    Args:
        df: user frame containing at least ``user_id`` and ``register_day``.
        ed: last day to include.

    Returns:
        The feature frame.
    """
    df['register_time'] = ed - df.register_day + 1
    del df['register_day']

    # Explicit copies: a `time` column is added below, and assigning into a
    # filtered frame otherwise raises SettingWithCopyWarning.
    tapp = app[app.day <= ed].copy()
    tact = act[act.day <= ed].copy()
    tvideo = video[video.day <= ed].copy()
    tapp['time'] = ed - tapp.day
    tact['time'] = ed - tact.day
    tvideo['time'] = ed - tvideo.day

    if get_author_feature:
        # One column per rank 1..500: number of act rows the user has against
        # that rank during the last 14 days.
        gp = tact[(tact.time < 14) & (tact.ranks <= 500)].groupby(
            ['user_id', 'ranks']).size().unstack().reset_index()
        present = set(gp.columns)
        for rank in range(1, 501):
            # Ranks that never occur get an all-zero column so the output
            # always has a1..a500 in order.
            gp['a' + str(rank)] = gp.pop(rank) if rank in present else 0
        df = df.merge(gp, on=['user_id'], how='left')

    # ---- app ----
    df = docount(df, tapp, 'app', ['user_id'])
    df = domin(df, tapp, 'app', ['user_id'], 'day')
    df = dovar(df, tapp, 'app', ['user_id'], 'day')
    df['app_rate'] = df['app$user_id#'] / df['register_time']

    # ---- video ----
    df = docount(df, tvideo, 'video', ['user_id'])
    df = domin(df, tvideo, 'video', ['user_id'], 'day')
    df = doiq(df, tvideo, 'video', ['user_id'], 'day')
    df = doiq(df, tvideo[tvideo.time < 16], 'video16', ['user_id'], 'day')

    df['video_rate'] = df['video$user_id_by_day_iq'] / df['register_time']
    df['video_rate1'] = df['video$user_id_by_day_iq'] / df['app$user_id#']
    df['video_mean'] = df['video$user_id#'] / df['register_time']
    df['video_mean1'] = df['video$user_id#'] / df['app$user_id#']
    df['video_mean2'] = df['video$user_id#'] / df['video$user_id_by_day_iq']

    # ---- act ----
    df = docount(df, tact, 'act', ['user_id'])
    df = domin(df, tact, 'act', ['user_id'], 'day')
    df = doiq(df, tact, 'act', ['user_id'], 'day')
    df = doiq(df, tact[tact.time < 16], 'act16', ['user_id'], 'day')
    df['act_rate'] = df['act$user_id_by_day_iq'] / df['register_time']
    df['act_rate1'] = df['act$user_id_by_day_iq'] / df['act$user_id#']
    df['act_mean'] = df['act$user_id#'] / df['register_time']
    df['act_mean1'] = df['act$user_id#'] / df['app$user_id#']
    df['act_mean2'] = df['act$user_id#'] / df['act$user_id_by_day_iq']

    df = docount(df, tapp[(tapp.time < 7) & (tapp.weekend == 1)],
                 'app7_weekend', ['user_id'])

    top_recent = tact[(tact.time < 16) & (tact.ranks < 500)]
    df = doiq(df, top_recent, 'act16_top500', ['user_id'], 'author_id')
    df = doiq(df, top_recent, 'act16_top500', ['user_id'], 'video_id')

    # Row counts over the most recent days (time <= i).
    for events, kind in ((tapp, 'app'), (tact, 'act')):
        for i in range(2, 7):
            df = _merge_group_size(df, events[events.time <= i], 'user_id',
                                   'last_' + str(i) + '_days_' + kind + '#')

    # Distinct authors / videos per user, and the same per registered day.
    for col in ('author_id', 'video_id'):
        distinct = tact[['user_id', col]].groupby(
            ['user_id'])[col].nunique().rename('act_' + col +
                                               '_u#').reset_index()
        df = pd.merge(df, distinct, on=['user_id'], how='left')
        df['act_' + col + '_u_mean#'] = (df['act_' + col + '_u#'] /
                                         df['register_time'])

    df['video_author_m'] = df['act_video_id_u#'] / df['act_author_id_u#']
    df['act_author_id_u_mean1#'] = (df['act_author_id_u#'] /
                                    df['act$user_id_by_day_iq'])
    df['act_video_id_u_mean1#'] = (df['act_video_id_u#'] /
                                   df['act$user_id_by_day_iq'])

    # Per-user counts by page, then by action type, for three look-back
    # windows. One combined mask replaces the chained `tact[a][b]` selection,
    # which made pandas re-align the second mask and emit a warning.
    for field in ('page', 'action_type'):
        for i in [3, 7, 14]:
            for c in [0, 1, 2, 3]:
                df = _merge_group_size(
                    df, tact[(tact['time'] < i) & (tact[field] == c)],
                    'user_id',
                    'act_' + str(i) + '_author_' + field + '_' + str(c) +
                    '_u#')

    # Gap between each user's two latest distinct days within the last 16.
    df = _merge_last_gap(df, tapp[tapp['time'] < 16], 'app_last_gap')
    df = _merge_last_gap(df, tvideo[tvideo['time'] < 16], 'video_last_gap')
    df = _merge_last_gap(df, tact[tact['time'] < 16], 'act_last_gap')

    # Expose user_id under the name author_id so act rows in which the user
    # appears as author can be joined on that key.
    df['author_id'] = df['user_id']
    df = _merge_group_size(df, tact[tact['time'] < 16], 'author_id',
                           'author#')

    for i in [7, 14]:
        for c in [0, 1, 2, 3]:
            df = _merge_group_size(
                df, tact[(tact['time'] < i) & (tact['action_type'] == c)],
                'author_id',
                'act_' + str(i) + '_author_action_type_' + str(c) + '_a#')

    for c in [1, 2, 3, 4]:
        df = _merge_group_size(
            df, tact[(tact['time'] < 16) & (tact['page'] == c)], 'author_id',
            'author_act_page_' + str(c) + '#')

    del df['author_id']

    # Raw totals were only needed as inputs to the ratios above.
    del df['app$user_id#'], df['video$user_id#'], df['act$user_id#'], df[
        'act_author_id_u#'], df['act_video_id_u#']

    del df['act$user_id_by_day_iq'], df['video$user_id_by_day_iq']

    # Per-day row counts for each of the last 16 days (time 0..15).
    for events, prefix in ((tapp, 'app_'), (tvideo, 'video_count_'),
                           (tact, 'act_count_')):
        for i in range(16):
            df = _merge_group_size(df, events[events.time == i], 'user_id',
                                   prefix + str(i))

    return df
Пример #8
0
    def get_features(df,d1,d2):
        """Attach activity features for the day window ``[d1, d2]`` to ``df``.

        The ``app``, ``act`` and ``video`` frames from the enclosing scope are
        restricted to rows with ``d1 <= day <= d2`` and their ``day`` column is
        re-based so the window starts at 0.  Feature columns are then added by
        the ``docount`` / ``domax`` / ``doiq`` helpers defined elsewhere in this
        file (NOTE(review): presumably per-key row count, per-key maximum and
        per-key distinct count -- confirm against their definitions).

        The helpers are called in a fixed order (app, video, act totals, act by
        page, act distinct authors/videos, act by action type); the order is
        kept stable because it presumably determines the output column order.

        Args:
            df: Frame with a ``register_day`` column (and presumably
                ``user_id``, the key passed to every helper).  It is modified
                in place: ``register_time`` is added and ``register_day`` is
                deleted before any helper runs.
            d1: First day of the window (inclusive).
            d2: Last day of the window (inclusive).

        Returns:
            The frame produced by chaining the helper calls.
        """
        # Assigning a column on a boolean-filtered slice of the shared frames
        # is what pandas flags with SettingWithCopyWarning; take explicit
        # copies so the re-based ``day`` column is written to frames owned
        # by this call.
        tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
        tact = act[(act.day >= d1) & (act.day <= d2)].copy()
        tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
        tapp['day'] = tapp['day'] - d1
        tact['day'] = tact['day'] - d1
        tvideo['day'] = tvideo['day'] - d1
        lastday = d2 - d1

        df['register_time'] = d2 - df.register_day + 1
        del df['register_day']

        # app: only the per-registered-day launch rate is kept; the raw count
        # produced by docount is dropped again.
        df = docount(df, tapp, 'app', ['user_id'])
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#']
        gc.collect()

        # "last N days" masks: ``day > lastday - n`` selects the final n days
        # of the re-based window.  Computed once and reused below.
        video_recent = {n: tvideo.day > lastday - n for n in (2, 3)}
        act_recent = {n: tact.day > lastday - n for n in (1, 2, 3)}

        # video
        for n in (2, 3):
            df = docount(df, tvideo[video_recent[n]],
                         'video_last_' + str(n), ['user_id'])
        gc.collect()

        # act: turn the helper's ``day`` maximum into a distance from the end
        # of the window (1 when the maximum equals the last day).
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df['last_act_day'] = lastday - df['act$user_id_by_day_max'] + 1
        del df['act$user_id_by_day_max']

        for n in (2, 3):
            df = docount(df, tact[act_recent[n]],
                         'act_last_' + str(n), ['user_id'])
        gc.collect()

        # act split by page, for each of the last 3 / 2 / 1 days.
        for c in [0, 1, 2, 3]:
            on_page = tact['page'] == c
            for n in (3, 2, 1):
                df = docount(df, tact[on_page & act_recent[n]],
                             'act_last_' + str(n) + '_page=' + str(c),
                             ['user_id'])

        # doiq over author_id and video_id, for each of the last 3 / 2 / 1 days.
        for n in (3, 2, 1):
            for col in ('author_id', 'video_id'):
                df = doiq(df, tact[act_recent[n]],
                          'act_last_' + str(n), ['user_id'], col)

        # act split by action type, for each of the last 3 / 2 / 1 days.
        for c in [0, 1, 2, 3]:
            of_type = tact['action_type'] == c
            for n in (3, 2, 1):
                df = docount(df, tact[of_type & act_recent[n]],
                             'act_last_' + str(n) + '_action_type=' + str(c),
                             ['user_id'])

        gc.collect()

        return df