def compute_video_rank_table():
    """Rebuild the ``video_rank`` document stored in ``db['rank_table']``.

    For every counter field (``cView``, ``cLike``, ``cDanmaku``,
    ``cFavorite``, ``cCoin``, ``cShare``) two things are collected under
    that field's name:

    * the titles of the first 200 videos in descending order of the field,
      each mapped to its 1-based position;
    * a ``rate`` list of up to 60 sampled field values. Each sample is the
      document found after skipping ``int(count / 100)`` documents whose
      value is strictly below the previous sample (descending order).

    The assembled table is upserted with ``$set`` and stamped with
    ``update_time`` (current UTC time plus eight hours). Progress is
    reported through a ``ProgressTask`` and intermediate values are printed
    to stdout.
    """
    sample_count = 60
    fields = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']

    videos = db['video']  # handle to the video collection
    total_videos = videos.estimated_document_count()
    print(total_videos)

    progress = ProgressTask(
        '计算视频排名对照表', sample_count * len(fields),
        collection=db['tracer'])

    # Number of documents to jump over between two consecutive samples.
    stride = int(total_videos / 100)

    table = {}
    for field_index, field in enumerate(fields):
        table[field] = {}
        table['name'] = 'video_rank'
        table[field]['rate'] = []

        # Title -> position for the 200 highest-valued videos of this field.
        top_cursor = (videos.find({}, {'title': 1})
                      .limit(200)
                      .sort(field, DESCENDING)
                      .batch_size(200))
        for position, top_doc in enumerate(top_cursor, start=1):
            table[field][top_doc['title']] = position

        # Walk down the value distribution, one stride at a time.
        threshold = 9999999999
        for sample_no in range(1, sample_count + 1):
            progress.current_value = sample_no + sample_count * field_index
            hits = list(
                videos.find({field: {'$lt': threshold}}, {field: 1})
                .limit(1)
                .skip(stride)
                .sort(field, DESCENDING))
            print(hits)
            if not hits:
                # Nothing that far down; this sample slot stays empty.
                continue
            hit = hits[0]
            if field not in hit:
                # Reached documents that do not carry this field at all.
                break
            threshold = hit[field]
            table[field]['rate'].append(threshold)
            print(threshold)

    table['update_time'] = (
        datetime.datetime.utcnow() + datetime.timedelta(hours=8))
    db['rank_table'].update_one(
        {'name': 'video_rank'}, {'$set': table}, upsert=True)
def gen_online():
    """Queue the bilibili "online" page URL for crawling.

    Appends the URL to the ``online:start_urls`` Redis list and marks the
    single-step ``ProgressTask`` as completed.
    """
    progress = ProgressTask(
        "生成在线人数爬取链接", 1, collection=db['tracer'])
    redis_connection.rpush(
        "online:start_urls", 'https://www.bilibili.com/video/online.html')
    progress.current_value = 1
def author_fans_rate_caculate():
    """Estimate each author's fan growth over one day and store it as ``cRate``.

    For every document in ``db['author']`` the ``data`` entries newer than
    seven days are fetched, sorted by ``datetime`` and linearly
    interpolated (``fans`` over time). The difference between the
    interpolated fan counts at midnight one day ago and midnight two days
    ago is written to the author's ``cRate`` field, truncated to ``int``.

    Authors are skipped when they have fewer than two recent samples or
    when the samples do not strictly enclose both midnights, because the
    interpolation would then have to extrapolate.

    Side effects: configures root logging via ``logging.basicConfig``,
    reports progress through a ``ProgressTask`` and updates author
    documents in place.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s')
    logger = logging.getLogger(__name__)
    coll = db['author']  # handle to the author collection
    logger.info('开始计算粉丝增速')

    now = datetime.datetime.now()
    today = datetime.datetime(now.year, now.month, now.day)
    end_date = (today - datetime.timedelta(1)).timestamp()
    start_date = (today - datetime.timedelta(2)).timestamp()
    # Fixed once so that every author is evaluated against the same
    # seven-day window, however long the whole run takes.
    window_start = now - datetime.timedelta(7)

    task = ProgressTask(
        "计算粉丝增速", coll.count_documents({}), collection=db['tracer'])

    mids = coll.find({}, {'mid': 1, '_id': 0}).batch_size(200)
    for processed, each in enumerate(mids, start=1):
        task.current_value = processed

        ag = coll.aggregate([{
            '$match': {
                'mid': each['mid']
            }
        }, {
            '$project': {
                'mid': 1,
                'data': {
                    "$filter": {
                        "input": "$data",
                        "as": "each_data",
                        "cond": {
                            "$gt": ["$$each_data.datetime", window_start]
                        }
                    }
                }
            }
        }]).batch_size(1)

        # The author may have been removed since the outer query ran; an
        # empty result must not abort the whole job with StopIteration.
        each_author = next(ag, None)
        if each_author is None:
            continue

        samples = each_author.get('data')
        if samples is None:
            continue

        data = sorted(samples, key=lambda sample: sample['datetime'])
        if len(data) < 2:
            continue

        logger.info(each_author['mid'])
        x = tuple(sample['datetime'].timestamp() for sample in data)
        # Only interpolate when both target instants lie strictly inside
        # the sampled range; check this before building the interpolator.
        if not (start_date > x[0] and end_date < x[-1]):
            continue

        y = tuple(sample['fans'] for sample in data)
        inter_data = interp1d(x, y, kind='linear')([start_date, end_date])
        delta_fans = inter_data[1] - inter_data[0]
        coll.update_one({'mid': each_author['mid']},
                        {"$set": {
                            'cRate': int(delta_fans)
                        }})
def calculate_author_rank():
    """Recompute the ``rank`` sub-document of every author.

    For each of ``cFans``, ``cArchive_view`` and ``cArticle_view`` the
    authors that carry the field are visited in descending order of its
    value. The 1-based position is stored as the rank, the difference
    between the previously stored rank and the new position as the
    "d" rank (0 when there was no previous rank), and the result of
    ``format_p_rank(position, count)`` as the "p" rank. Authors whose
    value is 0 receive rank ``-1``, "p" rank ``-1`` and "d" rank ``0``.
    During the ``cArticle_view`` pass ``updateTime`` is set to the current
    local time. Each author's ``rank`` is written back with ``$set``.

    Relies on the module-level ``coll``, ``db``, ``logger`` and
    ``format_p_rank``.
    """
    # counter field -> (rank field, rank-delta field, "p" rank field)
    rank_fields = {
        'cFans': ('fansRank', 'dFansRank', 'pFansRank'),
        'cArchive_view': (
            'archiveViewRank', 'dArchiveViewRank', 'pArchiveViewRank'),
        'cArticle_view': (
            'articleViewRank', 'dArticleViewRank', 'pArticleViewRank'),
    }
    keys = ['cFans', 'cArchive_view', 'cArticle_view']

    # NOTE(review): the total is counted for keys[0] only, yet it is reused
    # for the "p" rank of all three keys -- confirm this is intended.
    count = coll.count_documents({keys[0]: {'$exists': 1}})
    t = ProgressTask(
        "计算作者排名数据", count * len(keys), collection=db['tracer'])

    for each_key in keys:
        logger.info("开始计算作者{}排名".format(each_key))
        rank_name, delta_name, percent_name = rank_fields[each_key]
        projection = {'mid': 1, 'rank': 1, each_key: 1}
        authors = (coll.find({each_key: {'$exists': 1}}, projection)
                   .batch_size(300)
                   .sort(each_key, DESCENDING))

        for position, author in enumerate(authors, start=1):
            t.current_value += 1
            logger.info("计算{}排名".format(author['mid']))
            ranked_before = 'rank' in author

            # Only authors that actually carry the field get a position.
            if each_key in author:
                if ranked_before:
                    # A rank was computed on an earlier run: update it.
                    rank = author['rank']
                    if rank_name in rank:
                        rank[delta_name] = rank[rank_name] - position
                    else:
                        rank[delta_name] = 0
                    rank[rank_name] = position
                    rank[percent_name] = format_p_rank(position, count)
                else:
                    # First time this author is ranked: initialise.
                    rank = {
                        rank_name: position,
                        delta_name: 0,
                        percent_name: format_p_rank(position, count),
                    }

            # A value of zero overrides whatever was computed above.
            if author[each_key] == 0:
                if ranked_before:
                    rank = author['rank']
                    rank[delta_name] = 0
                    rank[rank_name] = -1
                    rank[percent_name] = -1
                else:
                    rank = {rank_name: -1, delta_name: 0, percent_name: -1}

            if each_key == 'cArticle_view':
                rank['updateTime'] = datetime.datetime.now()

            coll.update_one({'mid': author['mid']}, {'$set': {'rank': rank}})

    t.current_value = t.total_value
    logger.info("计算作者排名结束")