def download_new_userid(self):
    """
    Download new user ids into the input directory; file names look like
    'userid_20XX-XX-XX'.

    Note: the extractors below read these files, so run this step first.
    """
    for day in range(self.gap.days):
        date_start = (self.s_format + timedelta(days=day)).strftime('%Y-%m-%d')
        sql = '''
            select distinct distinct_id
            from sc_events
            where date = date \'''' + date_start + '''\'
            and event = 'firstLogin'
            '''
        downloaded_userid = data_extract.data_from_hive(
            sql, 'sc').drop_duplicates(keep='first')
        np.savetxt('input/userid_' + date_start, downloaded_userid, fmt='%s')
    print('all user ids have been downloaded!')
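# A standalone sketch of the date loop above: s_format and gap are assumed to
# be a datetime and a timedelta, as their usage suggests. Each iteration
# yields one 'userid_YYYY-MM-DD' file name for the window.
from datetime import datetime, timedelta

s_format = datetime(2018, 1, 1)           # assumed window start
gap = datetime(2018, 1, 4) - s_format     # assumed window length (3 days)
for day in range(gap.days):
    date_start = (s_format + timedelta(days=day)).strftime('%Y-%m-%d')
    print('input/userid_' + date_start)   # input/userid_2018-01-01 ...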
def read_once_y(self, sql, filename, **kwargs):
    """
    Build the y label for one day: left-join the stored user ids against
    the next-day query result.
    :param sql: query whose result holds the retained users in a column 'y'
    :param filename: path of the saved user-id file for the day
    :return: y label DataFrame for one day
    """
    # load next-day data from the sc database
    df_next = data_extract.data_from_hive(sql, "sc")
    # load the stored user ids for the current day
    df_current = pd.read_csv(filename, header=None,
                             names=['full_id']).drop_duplicates(keep='first')
    # merge the next-day result onto the current-day users
    combined = pd.merge(df_current, df_next, left_on='full_id',
                        right_on='y', how='left')
    del df_current
    del df_next
    return combined
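# A minimal sketch of the label join in read_once_y, on synthetic frames:
# current-day users are left-joined against the next-day ids (column 'y'),
# so a null 'y' marks a user who did not come back. The 'retained' column
# is illustrative, not part of the method above.
import pandas as pd

df_current = pd.DataFrame({'full_id': ['u1', 'u2', 'u3']})
df_next = pd.DataFrame({'y': ['u1', 'u3']})    # users active the next day
labels = pd.merge(df_current, df_next, left_on='full_id',
                  right_on='y', how='left')
labels['retained'] = labels['y'].notnull().astype(int)
print(labels)                                  # u2 gets retained = 0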
def basic_info(self, date_start):
    """
    Extract basic user info: gender, status, operating system, city.

    Note: sc provides os and city; users provides gender and status.

    :param date_start: sign-in day for new users
    :return: distinct_id, gender, status, os, city
    """
    # target users for the day
    filename = 'input/userid_' + date_start
    target_user = self.load_userid(filename)

    # features: os, city
    # database: sc
    sql = '''
        select distinct_id, "$os" as os, "$city" as city
        from sc_events
        where distinct_id in (''' + target_user + ''')
        and date = date \'''' + date_start + '''\'
        group by distinct_id, "$os", "$city"
        '''
    os_city = data_extract.data_from_hive(sql, "sc")
    # result: distinct_id, os, city; check for duplicated ids
    self.check_duplicated(os_city, 'distinct_id')

    # features: gender, status
    # database: users
    sql = '''
        select distinct id, gender, status
        from users
        where id in (''' + target_user + ''')
        and date(time) = date \'''' + date_start + '''\'
        '''
    gender_status = data_extract.data_from_hive(sql, "default")
    # map raw status ids to readable labels
    gender_status.loc[:, ['status']] = gender_status.loc[:, ['status']] \
        .replace('59de007f7cd42330e56243b3', 'student') \
        .replace('59de00617cd42330e56243b2', 'worker') \
        .replace('59de008a4275dd7395c49cb5', 'other')
    # result: id, gender, status; check for duplicated ids
    self.check_duplicated(gender_status, 'id')

    # merge the sc and users results onto the target user ids
    df_userid = self.load_df_userid(filename)
    combined = pd.merge(df_userid, os_city, left_on='userid',
                        right_on='distinct_id',
                        how='left').drop(['distinct_id'], axis=1)
    combined = pd.merge(combined, gender_status, left_on='userid',
                        right_on='id', how='left')
    del os_city
    del gender_status
    # drop old users whose sign-in date in users does not match sc
    combined = combined.loc[combined.loc[:, 'id'].notnull(), :].drop(
        ['id'], axis=1)
    return combined
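# The three chained .replace calls above can be written as one dict-based
# replace; a small equivalence check on made-up rows.
import pandas as pd

status_labels = {'59de007f7cd42330e56243b3': 'student',
                 '59de00617cd42330e56243b2': 'worker',
                 '59de008a4275dd7395c49cb5': 'other'}
gender_status = pd.DataFrame({'id': ['a', 'b'],
                              'status': ['59de007f7cd42330e56243b3',
                                         '59de008a4275dd7395c49cb5']})
gender_status['status'] = gender_status['status'].replace(status_labels)
print(gender_status)   # statuses become student / other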
def add_extra_features(self):
    """
    Extract additional features: payorder, isgroup, activities_page.
    :return: userid, payorder, isgroup, activities_page
    """
    extra_battle = pd.DataFrame([])
    for day in range(self.gap.days):
        # update date_start for this day of the window
        date_start = (self.s_format + timedelta(days=day)).strftime('%Y-%m-%d')
        # load target user ids (string for sql, DataFrame for merging)
        filename = 'input/userid_' + date_start
        target_user = self.load_userid(filename)
        userid = self.load_df_userid(filename)

        # payorder: users who placed their first paid order that day
        sql = '''
            select distinct buyer as payorder
            from orders
            where date(created) = date(\'''' + date_start + '''\')
            and firstorder = true
            and paid = true
            and buyer in (''' + target_user + ''')
            '''
        payorder = data_extract.data_from_hive(sql, 'default')
        payorder = pd.merge(userid, payorder, left_on='userid',
                            right_on='payorder', how='left')
        # encode as a flag: 1 = placed a first paid order, 0 = did not
        payorder.loc[:, 'payorder'].fillna(999, inplace=True)
        payorder.loc[:, 'payorder'] = payorder.loc[:, 'payorder'].map(
            lambda x: 0 if x == 999 else 1)

        # isgroup: users who joined a group order that day
        sql = '''
            select buyer as general_isgroup, paid, state
            from db_orders
            where date("create") = date(\'''' + date_start + '''\')
            and "group" is not null
            and groupstate is not null
            and buyer in (''' + target_user + ''')
            '''
        isgroup = data_extract.data_from_hive(sql, 'default')
        isgroup = pd.merge(userid, isgroup, left_on='userid',
                           right_on='general_isgroup', how='left')

        # activities_page: per-user visit counts for each activity page
        sql = '''
            select distinct_id as activities_id, "$title" as page
            from sc_events
            where event = '$AppViewScreen'
            and "$screen_name" = '活动页'
            and date = date(\'''' + date_start + '''\')
            and "$title" != '跳转中'
            and "$title" != ''
            and distinct_id in (''' + target_user + ''')
            '''
        activities_page = data_extract.data_from_hive(sql, 'sc')
        activities_page = pd.merge(userid, activities_page, left_on='userid',
                                   right_on='activities_id', how='left')
        # pivot: one column per page title, values are visit counts
        activities_page = activities_page.groupby(
            ['userid', 'page']).size().unstack(fill_value=0).reset_index(level=0)

        # merge the day's features and append to the running result
        combined = pd.merge(payorder, isgroup, on='userid', how='left')
        combined = pd.merge(combined, activities_page, on='userid', how='left')
        extra_battle = pd.concat([extra_battle, combined], ignore_index=True)

    extra_battle.to_csv('output/extra_features.csv', encoding='utf-8')
    return extra_battle
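# A toy version of the activities_page pivot above: groupby(...).size()
# counts (userid, page) rows, and unstack(fill_value=0) turns each page
# title into a count column, zero for pages a user never opened.
import pandas as pd

visits = pd.DataFrame({'userid': ['u1', 'u1', 'u2'],
                       'page': ['sale', 'sale', 'coupon']})
pivot = visits.groupby(['userid', 'page']).size() \
              .unstack(fill_value=0).reset_index(level=0)
print(pivot)   # columns: userid, coupon, sale; u1 has sale = 2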
def key_path(self, date_start):
    """
    Extract key-path counts from each user's ordered action sequence,
    e.g. 首页 --> 文章详情页 --> 买手主页.
    :param date_start: first login date
    :return: key_path_id plus one count column per path (path1..path6)
    """
    # target users for the day
    filename = 'input/userid_' + date_start
    target_user = self.load_userid(filename)

    # extract the complete sequence of actions
    sql = '''
        select distinct_id as key_path_id, time, event, "$screen_name", action_type
        from sc_events
        where distinct_id in (''' + target_user + ''')
        and date = date(\'''' + date_start + '''\')
        and event in ('$AppViewScreen', 'reviewClick', 'reviewDetail',
                      'subscribe', 'favorite')
        and "$screen_name" is not null
        and "$screen_name" != ''
        and "$screen_name" != '全部'
        and "$screen_name" in ('首页', '文章详情页', '买手主页', '优质买手列表页',
                               '最新买手列表页', '买手分类列表页', '发现买手')
        '''
    test = data_extract.data_from_hive(sql, "sc")
    # order each user's events chronologically
    test = test.sort_values(['key_path_id', 'time'], axis=0)

    # label each event: action_type if present (关注/取关), the event name
    # for 'favorite', otherwise the screen name
    conditions = [
        test['action_type'].notnull(), test['event'] == 'favorite'
    ]
    choices = [test['action_type'], test['event']]
    test['actions'] = np.select(conditions, choices,
                                default=test['$screen_name'])
    # drop columns that are no longer needed
    test.drop(['$screen_name', 'action_type', 'event', 'time'],
              axis=1, inplace=True)
    # summing strings concatenates each user's actions into one sequence
    test = test.groupby(['key_path_id'], axis=0).sum()
    test.reset_index(level=0, inplace=True)

    # count key paths in each sequence
    # path1: 首页, 文章详情页
    test['path1'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('首页+文章详情页+', x)))
    # path2: 首页, 文章详情页, 买手主页
    test['path2'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('首页+文章详情页+买手主页', x)))
    # path3: 优质买手列表页, 买手主页, 关注
    test['path3'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('优质买手列表页+买手主页+关注', x)))
    # path4: 最新买手列表页, 买手主页, 关注
    test['path4'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('最新买手列表页+买手主页+关注', x)))
    # path5: 买手分类列表页, 买手主页, 关注
    test['path5'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('买手分类列表页+买手主页+关注', x)))
    # path6: 发现买手, 买手主页, 关注
    test['path6'] = test.loc[:, 'actions'].map(
        lambda x: len(re.findall('发现买手+买手主页+关注', x)))
    return test
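# A toy run of the action labelling in key_path: np.select takes action_type
# when present, the event name for 'favorite', and the screen name otherwise;
# groupby-sum then concatenates each user's labels into one string that the
# re.findall patterns scan. Rows here are synthetic.
import re
import numpy as np
import pandas as pd

test = pd.DataFrame({'event': ['$AppViewScreen', '$AppViewScreen', 'subscribe'],
                     'action_type': [None, None, '关注'],
                     '$screen_name': ['首页', '文章详情页', '买手主页']})
conditions = [test['action_type'].notnull(), test['event'] == 'favorite']
choices = [test['action_type'], test['event']]
test['actions'] = np.select(conditions, choices, default=test['$screen_name'])
sequence = ''.join(test['actions'])                  # '首页文章详情页关注'
print(len(re.findall('首页+文章详情页+', sequence)))  # 1 hit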
def user_demand(self, date_start):
    """
    Extract user-demand features: default subscriptions, same-day reads,
    sub/unsub counts, and purchase/share/favorite/read counts.
    :param date_start: first login date
    :return: sub_id, default_buyer, today_read, sub, unsub, purchase,
             share, favorite, read
    """
    # target users for the day
    filename = 'input/userid_' + date_start
    target_user = self.load_userid(filename)

    # features: default_buyer, today_read
    # db: userlogs
    sql = '''
        SELECT a.user, count(a.channel) as default_buyer,
               count(b.channel) as today_read
        from
            (SELECT user, channel
             FROM userlogs
             where action = 'CHANNEL_SUBSCRIBE'
             AND batchsubscribe = true
             and date = date(\'''' + date_start + '''\')
             and user in (''' + target_user + ''')
             group by user, channel) a
        full JOIN
            (SELECT user, channel
             FROM userlogs
             WHERE action like 'REVIEW_%'
             and date = date(\'''' + date_start + '''\')
             and user in (''' + target_user + ''')
             group by user, channel) b
        on a.user = b.user AND a.channel = b.channel
        GROUP by a.user
        '''
    today_rate = data_extract.data_from_hive(sql, "default")
    # key = user
    self.check_duplicated(today_rate, 'user')

    # features: unsub & sub counts
    # db: sc_events
    sql = '''
        select a.distinct_id as sub_id, b.unsub, c.sub
        from
            (select distinct_id
             from sc_events
             where date = date(\'''' + date_start + '''\')
             and event = 'firstLogin') a
        left join
            (select distinct_id, count(*) as unsub
             from sc_events
             where date = date(\'''' + date_start + '''\')
             and event = 'subscribe'
             and action_type = '取关'
             and distinct_id in (''' + target_user + ''')
             GROUP BY distinct_id) b
        on a.distinct_id = b.distinct_id
        left JOIN
            (select distinct_id, count(*) as sub
             from sc_events
             where date = date(\'''' + date_start + '''\')
             and event = 'subscribe'
             and action_type = '关注'
             and distinct_id in (''' + target_user + ''')
             GROUP BY distinct_id) c
        ON a.distinct_id = c.distinct_id
        '''
    sub = data_extract.data_from_hive(sql, "sc")
    # key = sub_id
    self.check_duplicated(sub, 'sub_id')

    # features: purchase, share, favorite, read
    # db: sc_events
    sql = '''
        SELECT a.distinct_id as action_id, b.purchase, c.share, d.favorite, e.read
        FROM
            (select distinct_id
             from sc_events
             where date = date(\'''' + date_start + '''\')
             and event = 'firstLogin') a
        left join
            (select distinct_id, count(*) as purchase
             from sc_events
             where event = 'payOrder'
             and date = date(\'''' + date_start + '''\')
             and distinct_id in (''' + target_user + ''')
             group by distinct_id) b
        on a.distinct_id = b.distinct_id
        left JOIN
            (select distinct_id, count(*) as share
             from sc_events
             where event = 'share'
             and date = date(\'''' + date_start + '''\')
             and distinct_id in (''' + target_user + ''')
             group by distinct_id) c
        ON a.distinct_id = c.distinct_id
        left JOIN
            (select distinct_id, count(*) as favorite
             from sc_events
             where event = 'favorite'
             and date = date(\'''' + date_start + '''\')
             and distinct_id in (''' + target_user + ''')
             group by distinct_id) d
        ON a.distinct_id = d.distinct_id
        left JOIN
            (select distinct_id, count(*) as read
             from sc_events
             where event = 'reviewDetail'
             and date = date(\'''' + date_start + '''\')
             and distinct_id in (''' + target_user + ''')
             group by distinct_id) e
        ON a.distinct_id = e.distinct_id
        '''
    actions = data_extract.data_from_hive(sql, "sc")
    # key = action_id
    self.check_duplicated(actions, 'action_id')

    # merge one: sub & today_rate
    combined = pd.merge(sub, today_rate, left_on='sub_id',
                        right_on='user', how='left').drop('user', axis=1)
    # merge two: combined & actions
    combined = pd.merge(combined, actions, left_on='sub_id',
                        right_on='action_id',
                        how='left').drop('action_id', axis=1)
    # fill null counts with zero
    columns = [
        'unsub', 'sub', 'today_read', 'purchase', 'share', 'favorite', 'read'
    ]
    combined.loc[:, columns] = combined.loc[:, columns].fillna(0)
    return combined
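# The merge-then-fill pattern above, in miniature: successive left joins keep
# one row per first-login user, and fillna(0) on the count columns turns
# 'no events that day' into zero counts. Frames here are synthetic.
import pandas as pd

sub = pd.DataFrame({'sub_id': ['u1', 'u2'], 'sub': [3.0, None]})
actions = pd.DataFrame({'action_id': ['u1'], 'read': [5.0]})
combined = pd.merge(sub, actions, left_on='sub_id',
                    right_on='action_id', how='left').drop('action_id', axis=1)
combined.loc[:, ['sub', 'read']] = combined.loc[:, ['sub', 'read']].fillna(0)
print(combined)   # u2 ends up with sub = 0, read = 0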
def exteral_reasons(self, date_start):
    """
    Extract features about external reasons: acquisition channel,
    activities (coin, lucky_money), and weekend sign-up.
    :param date_start: first login day
    :return: exteral_reasons_id, date, weekend, fromchannel, coin, lucky_money
    """
    # target users for the day
    filename = 'input/userid_' + date_start
    target_user = self.load_userid(filename)

    # features and sources: first login date & weekend (sc_events),
    # fromchannel (users), coin (db_coingrouplinks),
    # lucky_money (db_dogyearinvitelogs)

    # first login date per user
    sql = '''
        select distinct_id as exteral_reasons_id, date
        from sc_events
        where date = date(\'''' + date_start + '''\')
        and event = 'firstLogin'
        '''
    first_weekend = data_extract.data_from_hive(sql, "sc")
    self.check_duplicated(first_weekend, 'exteral_reasons_id')
    # weekend flag: 1 -> weekend; 0 -> weekday
    first_weekend['weekend'] = first_weekend.loc[:, 'date'] \
        .map(lambda x: 1 if datetime.strptime(x, '%Y-%m-%d').isoweekday() >= 6 else 0)

    # acquisition channel
    # db: users
    sql = '''
        select id as user_id, fromchannel
        from users
        where date(time) = date(\'''' + date_start + '''\')
        and id in (''' + target_user + ''')
        '''
    fromchannel = data_extract.data_from_hive(sql, "default")
    self.check_duplicated(fromchannel, 'user_id')

    # coin activity participants
    # db: db_coingrouplinks
    sql = '''
        select distinct user as coin
        from db_coingrouplinks
        where date(createdat) = date(\'''' + date_start + '''\')
        '''
    coin = data_extract.data_from_hive(sql, "default")
    self.check_duplicated(coin, 'coin')

    # lucky-money invitees
    # db: db_dogyearinvitelogs
    sql = '''
        select distinct invitee as lucky_money
        from db_dogyearinvitelogs
        where date(createdat) = date(\'''' + date_start + '''\')
        and invitee is not null
        '''
    lucky_money = data_extract.data_from_hive(sql, "default")
    self.check_duplicated(lucky_money, 'lucky_money')

    # merge one: first_weekend & fromchannel
    combined = pd.merge(first_weekend, fromchannel,
                        left_on='exteral_reasons_id', right_on='user_id',
                        how='left').drop('user_id', axis=1)
    # merge two: combined & coin
    combined = pd.merge(combined, coin, left_on='exteral_reasons_id',
                        right_on='coin', how='left')
    # merge three: combined & lucky_money
    combined = pd.merge(combined, lucky_money,
                        left_on='exteral_reasons_id',
                        right_on='lucky_money', how='left')
    del first_weekend
    del fromchannel
    del coin
    del lucky_money

    # encode coin and lucky_money as participation flags (1 = joined)
    combined.loc[:, 'coin'].fillna('999', inplace=True)
    combined.loc[:, 'lucky_money'].fillna('999', inplace=True)
    combined.loc[:, 'coin'] = combined.loc[:, 'coin'].map(
        lambda x: 0 if x == '999' else 1)
    combined.loc[:, 'lucky_money'] = combined.loc[:, 'lucky_money'].map(
        lambda x: 0 if x == '999' else 1)
    return combined
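# The weekend flag above, isolated: isoweekday() returns 6 for Saturday and
# 7 for Sunday, so '>= 6' marks weekend first logins. Dates are arbitrary.
from datetime import datetime

for d in ['2018-02-03', '2018-02-05']:   # a Saturday and a Monday
    weekend = 1 if datetime.strptime(d, '%Y-%m-%d').isoweekday() >= 6 else 0
    print(d, weekend)                    # 2018-02-03 1 / 2018-02-05 0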
def member_center(self, date_start):
    """
    Extract member-center information: completed_info, SignIn.

    Note: databases are db_pointslogs and db_checkinlogs.

    :param date_start: first login day
    :return: userid, completed_info, SignIn
    """
    # target users for the day
    filename = 'input/userid_' + date_start
    target_user = self.load_userid(filename)

    # completed_info: users who completed their profile that day
    # db: db_pointslogs
    sql = '''
        select distinct user as completed_info
        from db_pointslogs
        where date(createdat) = date(\'''' + date_start + '''\')
        and task = 'completeUserInfo'
        and user in (''' + target_user + ''')
        '''
    completed_info = data_extract.data_from_hive(sql, "default")

    # SignIn: check-in counts for the day
    # db: db_checkinlogs
    sql = '''
        select user, count(date(checkintime)) as SignIn
        from db_checkinlogs
        where date(checkintime) = date(\'''' + date_start + '''\')
        and user in (''' + target_user + ''')
        group by user
        '''
    SignIn = data_extract.data_from_hive(sql, "default")
    self.check_duplicated(SignIn, 'user')

    # merge onto the target user ids
    df_userid = self.load_df_userid(filename)
    # merge one: df_userid & completed_info
    combined = pd.merge(df_userid, completed_info, left_on='userid',
                        right_on='completed_info', how='left')
    # merge two: combined & SignIn
    combined = pd.merge(combined, SignIn, left_on='userid',
                        right_on='user', how='left').drop('user', axis=1)

    # encode completed_info as a flag (1 = completed) and fill check-ins
    combined.loc[:, 'completed_info'].fillna('999', inplace=True)
    combined.loc[:, 'completed_info'] = combined.loc[:, 'completed_info'].map(
        lambda x: 0 if x == '999' else 1)
    combined.loc[:, 'SignIn'].fillna(0, inplace=True)
    return combined
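# A hedged end-to-end sketch of driving these extractors. The class name and
# constructor below are assumptions (only s_format, gap and the methods above
# are visible here), so the calls are left commented out:
#
#   extractor = NewUserFeatures('2018-01-01', '2018-01-08')  # hypothetical ctor
#   extractor.download_new_userid()
#   date_start = '2018-01-01'
#   frames = [extractor.basic_info(date_start),
#             extractor.key_path(date_start),
#             extractor.user_demand(date_start),
#             extractor.exteral_reasons(date_start),
#             extractor.member_center(date_start)]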