Пример #1
0
def compute_video_rank_table():
    """Build the video ranking lookup table and upsert it into ``rank_table``.

    For every metric in ``keys`` the stored document receives a sub-document
    holding:

    * the titles of the 200 videos that sort first on that metric, each
      mapped to its 1-based position, and
    * a ``rate`` list of up to ``top_n`` threshold values, obtained by
      repeatedly stepping ``count / 100`` documents further down the
      descending ordering of the metric.

    The table is written to ``db['rank_table']`` under
    ``{'name': 'video_rank'}`` (upsert). Progress is reported through a
    ``ProgressTask`` bound to ``db['tracer']``.
    """
    task_name = '计算视频排名对照表'
    coll = db['video']  # handle to the video collection
    count = coll.estimated_document_count()
    top_n = 60
    print(count)
    keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare']
    task = ProgressTask(task_name, top_n * len(keys), collection=db['tracer'])
    o = {'name': 'video_rank'}
    # Number of documents stepped over between two consecutive thresholds.
    skip = int(count / 100)
    for each_key_index, each_key in enumerate(keys):
        rates = []
        key_table = {'rate': rates}
        o[each_key] = key_table
        last_value = 9999999999
        # logger.info("开始计算视频{}排名对照表".format(each_key))
        top_videos = coll.find({}, {
            'title': 1
        }).limit(200).sort(each_key, DESCENDING).batch_size(200)
        for top, each_video in enumerate(top_videos, start=1):
            key_table[each_video['title']] = top
        # A video literally titled "rate" would have replaced the threshold
        # list above; re-attach it so the appends below are never lost.
        key_table['rate'] = rates

        for i in range(1, top_n + 1):
            task.current_value = i + top_n * each_key_index
            video = list(
                coll.find({
                    each_key: {
                        '$lt': last_value
                    }
                }, {
                    each_key: 1
                }).limit(1).skip(skip).sort(each_key, DESCENDING))
            print(video)
            if not video:
                # Nothing is left below last_value. Neither last_value nor
                # skip changes from here on, so every remaining iteration
                # would repeat this exact query; stop and account for the
                # skipped steps in the progress value instead.
                task.current_value = top_n * (each_key_index + 1)
                break
            video = video[0]
            if each_key not in video:
                # Defensive: the filter above should already guarantee the
                # field is present.
                break
            last_value = video[each_key]
            rates.append(last_value)
            print(last_value)
    # Naive UTC+8 wall-clock timestamp, built without the deprecated
    # datetime.utcnow().
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    o['update_time'] = utc_now + datetime.timedelta(hours=8)
    output_coll = db['rank_table']
    output_coll.update_one({'name': 'video_rank'}, {'$set': o}, upsert=True)
Пример #2
0
def gen_online():
    """Queue the bilibili online page as a crawl start URL.

    Appends the page URL to the ``online:start_urls`` key through
    ``redis_connection.rpush`` and records the one-step job in
    ``db['tracer']`` via ``ProgressTask``.
    """
    online_url = 'https://www.bilibili.com/video/online.html'
    progress = ProgressTask("生成在线人数爬取链接", 1, collection=db['tracer'])
    redis_connection.rpush("online:start_urls", online_url)
    # The job consists of a single step, so it is complete once the push
    # has been issued.
    progress.current_value = 1
Пример #3
0
def author_fans_rate_caculate():
    """Estimate every author's one-day follower change and store it as ``cRate``.

    For each document in ``db['author']`` the entries of its ``data`` array
    from the last seven days are fetched and sorted by ``datetime``. The
    ``fans`` value is linearly interpolated at local midnight two days ago
    and at local midnight yesterday, and the difference (truncated to
    ``int``) is written to the author's ``cRate`` field.

    Authors are skipped when they have fewer than two data points in the
    window or when either midnight lies outside the sampled time range.
    Progress is reported through a ``ProgressTask`` bound to
    ``db['tracer']``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s')
    logger = logging.getLogger(__name__)

    coll = db['author']  # handle to the author collection
    logger.info('开始计算粉丝增速')

    c_datetime = datetime.datetime.now()
    today = datetime.datetime(c_datetime.year, c_datetime.month,
                              c_datetime.day)
    end_date = (today - datetime.timedelta(1)).timestamp()
    start_date = (today - datetime.timedelta(2)).timestamp()
    # Computed once so every author is evaluated against the same 7-day
    # window, however long the job runs.
    history_cutoff = c_datetime - datetime.timedelta(7)

    task = ProgressTask("计算粉丝增速",
                        coll.count_documents({}),
                        collection=db['tracer'])

    mids = coll.find({}, {'mid': 1, '_id': 0}).batch_size(200)
    for c, each in enumerate(mids, start=1):
        task.current_value = c
        ag = coll.aggregate([{
            '$match': {
                'mid': each['mid']
            }
        }, {
            '$project': {
                'mid': 1,
                'data': {
                    "$filter": {
                        "input": "$data",
                        "as": "each_data",
                        "cond": {
                            "$gt": ["$$each_data.datetime", history_cutoff]
                        }
                    }
                }
            }
        }]).batch_size(1)
        # The author may have been removed since the listing cursor read it;
        # skip instead of letting StopIteration abort the whole job.
        each_author = next(ag, None)
        if each_author is None:
            continue
        history = each_author.get('data')
        if history is None:
            continue
        data = sorted(history, key=lambda point: point['datetime'])
        if len(data) < 2:
            continue
        logger.info(each_author['mid'])
        x = tuple(point['datetime'].timestamp() for point in data)
        y = tuple(point['fans'] for point in data)
        # Interpolate only when both instants lie strictly inside the
        # sampled range; the interpolator is not needed otherwise.
        if start_date > x[0] and end_date < x[-1]:
            inter_fun = interp1d(x, y, kind='linear')
            inter_data = inter_fun([start_date, end_date])
            delta_fans = inter_data[1] - inter_data[0]
            coll.update_one({'mid': each_author['mid']},
                            {"$set": {
                                'cRate': int(delta_fans)
                            }})
Пример #4
0
def calculate_author_rank():
    """Recompute the per-metric rank fields stored in each author's ``rank``.

    For each metric in ``keys`` the authors that have the field are walked
    in descending order of that metric, and three fields of the author's
    ``rank`` sub-document are refreshed:

    * the rank field: the 1-based position ``i``, or ``-1`` when the metric
      value is 0;
    * the delta field: previous position minus the new one, or 0 when there
      is no usable previous position;
    * the percentile field: ``format_p_rank(i, count)``, or ``-1`` when the
      metric value is 0.

    While processing ``cArticle_view`` the ``rank.updateTime`` field is also
    stamped with the current time. Uses the module-level ``coll`` and
    ``logger``; progress is reported through a ``ProgressTask`` bound to
    ``db['tracer']``.
    """
    task_name = "计算作者排名数据"
    keys = ['cFans', 'cArchive_view', 'cArticle_view']
    # metric field -> (rank field, rank-delta field, percentile field)
    rank_fields = {
        'cFans': ('fansRank', 'dFansRank', 'pFansRank'),
        'cArchive_view': ('archiveViewRank', 'dArchiveViewRank',
                          'pArchiveViewRank'),
        'cArticle_view': ('articleViewRank', 'dArticleViewRank',
                          'pArticleViewRank'),
    }
    # NOTE(review): the population size is taken from keys[0] only and is
    # reused for every metric's percentile and for the progress total --
    # confirm this is intended.
    count = coll.count_documents({keys[0]: {'$exists': 1}})
    t = ProgressTask(task_name, count * len(keys), collection=db['tracer'])
    for each_key in keys:
        logger.info("开始计算作者{}排名".format(each_key))
        each_rank, each_d_rank, each_p_rank = rank_fields[each_key]
        authors = coll.find({
            each_key: {
                '$exists': 1
            }
        }, {
            'mid': 1,
            'rank': 1,
            each_key: 1
        }).batch_size(300).sort(each_key, DESCENDING)
        for i, each_author in enumerate(authors, start=1):
            t.current_value += 1
            logger.info("计算{}排名".format(each_author['mid']))
            # Start from the stored rank document when there is one so the
            # fields of the other metrics are preserved by the $set below.
            rank = each_author['rank'] if 'rank' in each_author else {}
            if each_author[each_key] == 0:
                # A zero metric is stored as "unranked".
                position, delta, percentile = -1, 0, -1
            else:
                position = i
                # -1 is the "unranked" sentinel written above, not a real
                # position, so it must not be used to compute a delta.
                if each_rank in rank and rank[each_rank] != -1:
                    delta = rank[each_rank] - i
                else:
                    delta = 0
                percentile = format_p_rank(i, count)
            rank[each_rank] = position
            rank[each_d_rank] = delta
            rank[each_p_rank] = percentile
            if each_key == 'cArticle_view':
                rank['updateTime'] = datetime.datetime.now()
            coll.update_one({'mid': each_author['mid']},
                            {'$set': {
                                'rank': rank,
                            }})
    t.current_value = t.total_value
    logger.info("计算作者排名结束")