예제 #1
0
class Feedback(object):

    def __init__(self, config=None):
        if config is None:
            config = {}
        self.redis = StrictRedis(**config['redis'])

    def get_allocation_key(self, site_id: str, experiment: str, recommender_id: str, allocation_time: datetime=None) -> str:
        if allocation_time is None:
            allocation_time = datetime.utcnow()

        long_key = '/'.join([site_id, experiment, recommender_id])
        short_key = '{:08x}'.format(mmh3.hash(long_key) & INT_MAX)

        timestamp = allocation_time.strftime(HOUR_FORMAT)
        return short_key + ':' + timestamp

    def insert(self, allocation_key: str, event_type: str, rec_id: str, ttl=timedelta(days=60)):
        full_key = event_type + ':' + allocation_key
        key_exists = self.redis.exists(full_key)

        self.redis.execute_command('PFADD', full_key, rec_id)

        if not key_exists:
            timestamp = allocation_key.rsplit(':', 1)[-1]
            allocation_time = datetime.strptime(timestamp, HOUR_FORMAT)
            expire_time = allocation_time + ttl
            self.redis.expireat(full_key, expire_time)

    def count_distinct(self, allocation_key: str, event_type: str) -> int:
        return self.redis.execute_command('PFCOUNT', event_type + ':' + allocation_key)

    def increment_arm(self, event: Dict):
        key = self.get_allocation_key(site_id=event['site_id'],
                                      experiment=event['experiment'],
                                      recommender_id=event['recommender_id'],
                                      allocation_time=event['allocation_time']
                                      )
        rec_id = event.get('recset', 'NA') + ':' + event.get('rec_group', 'NA') + ':' + event.get('rec_position', 'NA')
        self.insert(key, event['event_type'], rec_id)

    def flush_redis(self):
        self.redis.flushall()

    def event_listener(self):
        '''TODO: integrate with kafka --currently stubbed'''

        event_stream = iter([
            dict(site_id='666', event_type='rec_viewed', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='666', event_type='rec_viewed', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='666', event_type='rec_viewed', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='666', event_type='rec_clicked', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='666', event_type='rec_clicked', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='666', event_type='rec_clicked', rec_time=datetime(2016, 11, 1, 12),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='fmv1', experiment='default'),
            dict(site_id='238', event_type='rec_viewed', rec_time=datetime(2016, 11, 2, 8),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='collb', experiment='default'),
            dict(site_id='238', event_type='rec_clicked', rec_time=datetime(2016, 11, 2, 8),
                 recset='abc', rec_group='a', rec_position='1', recommender_id='collb', experiment='default')
        ])

        for event in event_stream:
            if event['event_type'] in ['rec_viewed', 'rec_clicked']:
                self.increment_arm(event)

    def perdelta(self, start: datetime, end: datetime, delta: timedelta) -> List[datetime]:
        output = []
        curr = start
        while curr < end:
            curr = curr + delta
            output.append(curr)
        return output

    def get_recent_hours(self, days_ago: int=30) -> List[datetime]:
        end_hour = datetime.now().replace(minute=0, second=0, microsecond=0)
        start_hour = end_hour - timedelta(days=days_ago)
        return self.perdelta(start_hour, end_hour, timedelta(hours=1))

    def enumerate_keys(self, site_ids: List[str], experiments: List[str], recommender_ids: List[str]) -> pd.DataFrame:
        header = ['site_id', 'experiment', 'recommender_id', 'date_hour', 'trials', 'successes']
        rows = []
        for site_id in site_ids:
            for experiment in experiments:
                for recommender_id in recommender_ids:
                    for date_hour in self.get_recent_hours():
                        key = self.get_allocation_key(site_id, experiment, recommender_id, date_hour)
                        view_count = self.count_distinct(key, 'rec_viewed')
                        click_count = self.count_distinct(key, 'rec_clicked')
                        rows.append([site_id, experiment, recommender_id, date_hour, view_count, click_count])
        df = pd.DataFrame(data=rows, columns=header)
        # TODO: date filter
        return df.groupby(['site_id', 'experiment', 'recommender_id']).sum().reset_index()