def import_average_category(self, channel_ids=None, start=None, stop=None, automatic_flush=True):
    """Recompute each channel's dominant category and write it to Elasticsearch.

    Tallies the categories of a channel's video instances, derives the main
    category via ``main_category``, and bulk-updates the ES channel documents.

    :param channel_ids: optional iterable of channel ids to restrict the scan to.
    :param start: if set, only channels with ``date_updated`` between
        ``start`` and ``stop`` are considered (``stop`` is only used when
        ``start`` is given).
    :param stop: upper bound for the ``date_updated`` window.
    :param automatic_flush: when True, force-flush the bulk ES connection
        at the end.
    """
    from collections import defaultdict
    from rockpack.mainsite.services.video.models import VideoInstance, Channel

    query = readonly_session.query(
        VideoInstance.category, Channel.id
    ).join(
        Channel, Channel.id == VideoInstance.channel
    ).order_by(Channel.id)
    if channel_ids:
        query = query.filter(Channel.id.in_(channel_ids))
    if start:
        query = query.filter(Channel.date_updated.between(start, stop))

    # channel id -> {video category -> occurrence count}
    category_map = defaultdict(lambda: defaultdict(int))
    for instance_cat, channel_id in query:
        category_map[channel_id][instance_cat] += 1

    app.logger.info('Channel category map size: %d', len(category_map))

    ec = ESChannel.updater(bulk=True)
    for channel_id, c_map in category_map.iteritems():
        ec.set_document_id(channel_id)
        # main_category picks the representative category from the tally
        # (defined elsewhere in this module).
        ec.add_field('category', main_category(c_map))
        ec.update()
        ec.reset()

    if automatic_flush:
        self.conn.flush_bulk(forced=True)
def import_channels(self, start=None, stop=None, automatic_flush=True):
    """Bulk-insert every public, visible, non-deleted channel into Elasticsearch.

    Also attaches a ``_video_count`` attribute (visible, non-deleted video
    instances) to each channel before indexing.

    :param start: if set, restrict to channels whose ``date_updated`` falls
        between ``start`` and ``stop``.
    :param stop: upper bound of the ``date_updated`` window.
    :param automatic_flush: when True, flush the bulk inserter at the end.
    """
    from rockpack.mainsite.services.video.models import Channel, VideoInstance, Video

    with app.test_request_context():
        # Eager-load the relations the ES document serialiser touches so the
        # yield_per scan below doesn't fire a query per channel.
        channels = Channel.query.filter(
            Channel.public == True,
            Channel.visible == True,
            Channel.deleted == False
        ).options(
            joinedload(Channel.category_rel),
            joinedload(Channel.metas),
            joinedload(Channel.owner_rel)
        )
        if start:
            channels = channels.filter(Channel.date_updated.between(start, stop))

        channel_total = channels.count()
        app.logger.info('importing %d channels', channel_total)
        began = time.time()

        bulk_inserter = ESChannel.inserter(bulk=True)

        # Per-channel counts of visible, non-deleted video instances.
        count_query = VideoInstance.query.join(
            Video,
            (Video.id == VideoInstance.video) & (Video.visible == True)
        ).filter(
            VideoInstance.deleted == False
        ).group_by(VideoInstance.channel)
        if start:
            # Restrict the counts selected to the channels we want
            count_query = count_query.join(
                Channel,
                (Channel.id == VideoInstance.channel) &
                (Channel.date_updated.between(start, stop))
            )
        counts_by_channel = dict(
            count_query.values(VideoInstance.channel, func.count(VideoInstance.id))
        )

        for position, channel in enumerate(channels.yield_per(6000), 1):
            channel._video_count = counts_by_channel.get(channel.id) or 0
            bulk_inserter.insert(channel.id, channel)
            if app.logger.isEnabledFor(logging.DEBUG):
                self.print_percent_complete(position, channel_total)

        if automatic_flush:
            bulk_inserter.flush_bulk()

    app.logger.debug('finished in %d seconds', time.time() - began)
def import_channel_share(self, automatic_flush=True):
    """Recompute per-locale normalised channel rank from share/star activity.

    Three aggregation passes feed a per-channel score which is written to the
    ES field ``normalised_rank['<locale>']``:

      1. 'star' UserActivity on a channel's video instances,
      2. ShareLinks pointing at the channel itself,
      3. ShareLinks pointing at the channel's video instances.

    Each pass sums a time-decay weight (recent actions count more; anything
    older than ``zulu`` contributes nothing) and min/max-normalises the sums.
    Only the ``norm``-prefixed entries of the working dict contribute to the
    final score pushed to Elasticsearch.

    :param automatic_flush: when True, flush ES bulk updates at the end.
    """
    from rockpack.mainsite.services.share.models import ShareLink
    from rockpack.mainsite.services.user.models import UserActivity, User
    from rockpack.mainsite.services.video.models import VideoInstance, Channel

    total = 0    # documents updated across all locales
    missing = 0  # channels not present in the ES index
    start_time = time.time()

    def _normalised(val, max_val, min_val):
        # Min/max normalisation of val over [min_val, max_val].
        # NOTE(review): the denominator uses abs(max) - abs(min), not
        # max - min — looks suspicious for negative bounds; confirm intended.
        try:
            return (val - min_val) / (abs(max_val) - abs(min_val))
        except (ZeroDivisionError, decimal.DivisionByZero, decimal.InvalidOperation):
            # Degenerate range (max == min) or decimal edge cases: score 0.
            return 0

    def _update_channel_id(id, val, max_val, min_val):
        # NOTE(review): this helper is never called in this function body.
        channel_dict[id] = channel_dict.setdefault(id, 0) + _normalised(val, max_val, min_val)

    # The strength of actions decay until any older than zulu have no effect
    # NOTE(review): zulu is built from naive local now() but compared against
    # utcnow() below — on a non-UTC host time_since_zulu is skewed; confirm.
    zulu = datetime.now() - timedelta(days=app.config.get('CHANNEL_RANK_ZULU', 1))
    time_since_zulu = (datetime.utcnow() - zulu).total_seconds()

    for locale in ['en-gb', 'en-us']:
        app.logger.debug('starting for %s', locale)

        channel_dict = {}    # channel id -> {score component name: value}
        channel_shares = {}  # channel id -> raw channel-share sum (pass 2)

        # Linear time-decay weight: 1 at "now", 0 at zulu.
        summation = func.sum(
            (time_since_zulu - (
                func.extract('epoch', datetime.utcnow()) -
                func.extract('epoch', UserActivity.date_actioned)
            )) / time_since_zulu
        )

        # activity for channels from videos
        query = readonly_session.query(
            distinct(Channel.id).label('channel_id'),
            summation.label('summed')
        ).join(
            VideoInstance, VideoInstance.channel == Channel.id
        ).join(
            UserActivity, UserActivity.object_id == VideoInstance.id
        ).join(
            User, User.id == UserActivity.user
        ).filter(
            UserActivity.action == 'star',
            UserActivity.object_type == 'video_instance',
            UserActivity.date_actioned > zulu,
            User.locale == locale
        ).group_by(Channel.id)

        # Bounds of the decayed sums, for normalisation.
        summed = query.subquery().columns.summed
        q_max, q_min = UserActivity.query.session.query(func.max(summed), func.min(summed)).one()

        for id, count in query.yield_per(6000):
            channel_dict.setdefault(id, {})
            # Raw + normalised kept for inspection; only the 'norm_'-prefixed
            # key feeds the final score.
            channel_dict[id]['user_activity'] = [count, _normalised(count, q_max, q_min)]
            channel_dict[id]['norm_user_activity'] = _normalised(count, q_max, q_min)

        app.logger.debug('user activity done')

        # Same decay weight, keyed on the share link's creation time.
        summation = func.sum(
            (time_since_zulu - (
                func.extract('epoch', datetime.utcnow()) -
                func.extract('epoch', ShareLink.date_created)
            )) / time_since_zulu
        )

        # activity for channel shares
        query = readonly_session.query(
            distinct(Channel.id).label('channel_id'),
            summation.label('summed')
        ).join(
            ShareLink, ShareLink.object_id == Channel.id
        ).join(
            User, User.id == ShareLink.user
        ).filter(
            Channel.deleted == False,
            Channel.public == True,
            ShareLink.object_type == 'channel',
            ShareLink.date_created > zulu,
            ShareLink.click_count > 0,
            User.locale == locale
        ).group_by(Channel.id)

        summed = query.subquery().columns.summed
        q_max, q_min = ShareLink.query.session.query(func.max(summed), func.min(summed)).one()
        # Remember pass-2 bounds; pass 3 combines them with its own bounds.
        channel_share_vals = (q_max, q_min)

        for id, count in query.yield_per(6000):
            channel_dict.setdefault(id, {})
            channel_shares[id] = count
            channel_dict[id]['share_link_channel'] = [count, _normalised(count, q_max, q_min)]

        app.logger.debug('channel shares done')

        # activity for videos shares of channels
        # (reuses the ShareLink-based `summation` expression from above)
        query = readonly_session.query(
            distinct(Channel.id).label('channel_id'),
            summation.label('summed')
        ).join(
            VideoInstance, VideoInstance.channel == Channel.id
        ).join(
            ShareLink, ShareLink.object_id == VideoInstance.id
        ).join(
            User, User.id == ShareLink.user
        ).filter(
            Channel.deleted == False,
            Channel.public == True,
            ShareLink.object_type == 'video_instance',
            ShareLink.date_created > zulu,
            ShareLink.click_count > 0,
            User.locale == locale
        ).group_by(Channel.id)

        summed = query.subquery().columns.summed
        q_max, q_min = ShareLink.query.session.query(func.max(summed), func.min(summed)).one()

        for id, count in query.yield_per(6000):
            channel_dict.setdefault(id, {})
            channel_dict[id]['share_link_video'] = [count, _normalised(count, q_max, q_min)]
            # Fold the channel-share sum (pass 2) into this pass's score.
            val = channel_shares.get(id, 0)
            # We may get None returned in the data
            if None in channel_share_vals:
                channel_share_vals = [0, 0]
            channel_dict[id]['norm_share_link_channel'] = channel_dict[id].setdefault('norm_share_link_channel', 0) + _normalised(
                count + val,
                q_max + channel_share_vals[0],
                q_min + channel_share_vals[1])

        app.logger.debug('video shares done')

        app.logger.debug('... updating elasticsearch for %s ...', locale)

        done = 1
        i_total = len(channel_dict)
        ec = ESChannel.updater(bulk=True)
        for id, _dict in channel_dict.iteritems():
            try:
                # Final score = sum of the 'norm'-prefixed components only.
                count = 0
                for k, v in _dict.iteritems():
                    if k.startswith('norm'):
                        count += v
                if count == 0:
                    continue
                ec.set_document_id(id)
                ec.add_field('normalised_rank[\'%s\']' % locale, float(count))
                ec.update()
            except exceptions.DocumentMissingException:
                # Channel not in the index; count it and carry on.
                missing += 1
            finally:
                ec.reset()
                total += 1
            if app.logger.isEnabledFor(logging.DEBUG):
                self.print_percent_complete(done, i_total)
            done += 1

    if automatic_flush:
        ESChannel.flush()

    app.logger.debug('%s total updates in two passes. finished in %s seconds (%s channels not in es)', total, time.time() - start_time, missing)