示例#1
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def crawl_save_fans(uk):
    """Fetch every page of a user's fans list and store each fan.

    For each fan record one row is written with ``bdp_db.insert_fans`` and the
    fan's uk is inserted into ``bdp_new_uk``. A failure on one record is
    logged and the remaining records are still processed.

    Args:
        uk: uk of the user whose fans are requested. It is concatenated into
            the request URL, so it must be a string.
    """
    max_limit = 25  # page size requested from the API
    start = 0
    url = ("http://yun.baidu.com/pcloud/friend/getfanslist?limit=" +
           str(max_limit) + "&query_uk=" + uk)
    page = fetch_data(url, start)
    tc = page.get('total_count', 0)

    # Process the page that was just fetched BEFORE deciding whether another
    # one is needed. The previous loop tested `tc > start` first, so the last
    # fetched page (and the only page when total_count <= max_limit) was
    # never saved.
    while True:
        for fan in page.get('fans_list', []):
            try:
                bdp_db.db_exec(
                    "insert", bdp_db.insert_fans,
                    [uk, fan.get('fans_uk', ''),
                     fan.get('follow_time', '')])  # save the relation
                bdp_db.db_exec("insert",
                               "insert into bdp_new_uk(uk)values(%s);",
                               [fan.get('fans_uk', '')])  # queue the fan's uk
            except Exception as e:
                logger.error("skip insert.err:" + str(e))

        start += max_limit
        if start >= tc:
            break
        # presumably the second argument is the paging offset -- confirm
        # against fetch_data
        page = fetch_data(url, start)

    logger.info("crawl_save_fans end.total_count=" + str(tc))
示例#2
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def loop_fans(uk):
    """Run ``do_crawl_rel`` on every fan stored for ``uk``.

    The fans are read from the ``fans_list`` table. Nothing is crawled when
    the query result compares equal to 0.
    """
    fan_rows = bdp_db.db_exec(
        "list", "select fans_uk from fans_list where uk='%s';", uk)
    logger.debug(fan_rows)
    if fan_rows == 0:
        # presumably db_exec signals "no result / failure" with 0 -- confirm
        return
    for fan_row in fan_rows:
        do_crawl_rel(fan_row[0])
示例#3
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def crawl_save_share(uk):
    """Fetch every page of a user's share list and store shares and files.

    Each share record is written with ``bdp_db.insert_share`` and every entry
    of its ``filelist`` with ``bdp_db.insert_file``. A failed insert is logged
    and skipped; it does not stop the crawl.

    Args:
        uk: uk of the user whose shares are requested. It is concatenated into
            the request URL, so it must be a string.
    """
    max_limit = 100  # page size requested from the API
    start = 0

    url = ("http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&limit=" +
           str(max_limit) + "&query_uk=" + uk)
    page = fetch_data(url, start)
    tc = page.get('total_count', 0)

    # Process the page that was just fetched BEFORE deciding whether another
    # one is needed. The previous loop tested `tc > start` first, so the last
    # fetched page (and the only page when total_count <= max_limit) was
    # never saved.
    while True:
        for rec in page.get('records', []):
            # Fall back to category_4_cnt when category_6_cnt is absent.
            category_6_cnt = rec.get('category_6_cnt', -1)
            if category_6_cnt == -1:
                category_6_cnt = rec.get('category_4_cnt', -1)

            try:
                bdp_db.db_exec("insert", bdp_db.insert_share, [
                    rec.get('uk', ''),
                    rec.get('shareid', ''),
                    rec.get('feed_type', ''),
                    rec.get('category', ''),
                    rec.get('public', ''),
                    rec.get('data_id', ''),
                    rec.get('title', ''),
                    rec.get('third', ''),
                    rec.get('clienttype', ''),
                    rec.get('filecount', ''),
                    rec.get('username', ''),
                    rec.get('feed_time', ''),
                    rec.get('desc', ''),
                    rec.get('avatar_url', ''),
                    category_6_cnt,
                    rec.get('source_uid', ''),
                    rec.get('source_id', ''),
                    rec.get('shorturl', ''),
                    rec.get('vCnt', ''),
                    rec.get('dCnt', ''),
                    rec.get('tCnt', ''),
                    rec.get('like_status', ''),
                    rec.get('like_count', ''),
                    rec.get('comment_count', ''),
                ])  # save the share
            except Exception as e:
                logger.warn("skip insert.err:" + str(e))

            # Files are saved even when the share insert above failed.
            for f in rec.get('filelist', []):
                try:
                    bdp_db.db_exec("insert", bdp_db.insert_file, [
                        rec.get('shareid', ''),
                        f.get('server_filename', ''),
                        f.get('category', ''),
                        f.get('isdir', ''),
                        f.get('size', ''),
                        f.get('fs_id', ''),
                        f.get('path', ''),
                        f.get('md5', ''),
                        f.get('sign', ''),
                        f.get('time_stamp', ''),
                    ])  # save one file of the share
                except Exception as e:
                    logger.error("skip insert.err:" + str(e))

        start += max_limit
        if start >= tc:
            break
        # presumably the second argument is the paging offset -- confirm
        # against fetch_data
        page = fetch_data(url, start)

    logger.debug("Success.total_count=" + str(tc))
示例#4
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def crawl_save_user(uk):
    """Fetch the profile of ``uk`` and store it with ``bdp_db.insert_user``.

    Nothing is written when the response has no ``user_info`` entry. Any
    error while building or inserting the row is logged and swallowed.
    """
    response = fetch_data(
        "http://yun.baidu.com/pcloud/user/getinfo?query_uk=" + uk, 0)
    info = response.get('user_info', None)
    if info is None:
        return

    # Order matters: it is the positional order of the insert parameters.
    columns = (
        'avatar_url',
        'fans_count',
        'follow_count',
        'album_count',
        'intro',
        'uname',
        'uk',
        'pubshare_count',
        'tui_user_count',
        'c2c_user_sell_count',
        'c2c_user_buy_count',
        'c2c_user_product_count',
        'pair_follow_type',
    )
    try:
        row = [info.get(column, '') for column in columns]
        bdp_db.db_exec("insert", bdp_db.insert_user, row)  # save the profile
    except Exception as e:
        logger.error("skip insert.err:" + str(e))
示例#5
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def do_crawl():
    """Drain the ``bdp_new_uk`` table, crawling shares and profile per uk.

    Rows are read in batches. A uk for which ``isExists`` is true is skipped;
    otherwise its shares and user info are crawled. The row is deleted from
    ``bdp_new_uk`` in both cases. When a batch is empty the module-level
    ``miss_times`` counter is incremented; the loop ends once it exceeds 10,
    otherwise it sleeps two seconds and polls again.
    """
    global miss_times

    while True:
        batch = bdp_db.db_exec("list", "select uk from bdp_new_uk limit %s;",
                               "20")
        if len(batch) == 0:
            miss_times += 1
            if miss_times > 10:
                break
            sleep(2)
            logger.warn("bdp_new_uk empty,waiting 2s...>>" + str(miss_times))
            continue

        for row in batch:
            uk = row[0]
            if not isExists(uk):
                crawl_save_share(uk)
                crawl_save_user(uk)
            else:
                logger.warn("skip exits uk:" + uk)

            # Remove the uk from the queue whether or not the crawl succeeded.
            # NOTE(review): this statement is built by string concatenation
            # and also passes uk as a parameter, unlike the other queries in
            # this file -- confirm how db_exec treats it.
            bdp_db.db_exec("delete",
                           "delete from bdp_new_uk where uk=" + str(uk),
                           uk)
示例#6
0
文件: bdp_crawl.py 项目: muzidudu/BDP
def isExists(uk):
    """Return 1 when ``user_info`` already has a row for ``uk``, else 0."""
    logger.debug("isExists:" + uk)
    matches = bdp_db.db_exec(
        "count", "select count(*) from user_info where uk=%s;", uk)
    return 1 if matches > 0 else 0
示例#7
0
文件: bdp_crawl.py 项目: muzidudu/BDP
                bdp_db.db_exec("delete",
                               "delete from bdp_new_uk where uk=" + str(uk),
                               uk)
        else:
            global miss_times
            miss_times += 1
            if (miss_times > 10):
                break
            sleep(2)
            logger.warn("bdp_new_uk empty,waiting 2s...>>" + str(miss_times))


#do_crawl("891489109")
#do_crawl("490155926")

# Seed uk: the first row currently in bdp_new_uk.
uk = bdp_db.db_exec("list", "select uk from bdp_new_uk limit %s;", "1")[0][0]
# Create the two threads: one runs do_crawl_rel on the seed uk, the other
# runs do_crawl.
threads = []
t1 = threading.Thread(target=do_crawl_rel, args=(str(uk), ))
threads.append(t1)

bdp_db.db_exec("insert", "insert into bdp_new_uk(uk)values(%s);",
               [uk])  # insert the seed uk into bdp_new_uk
t2 = threading.Thread(target=do_crawl, args=())
threads.append(t2)

for t in threads:
    # Thread.setDaemon() is deprecated; the daemon attribute is equivalent.
    t.daemon = True
    t.start()

# Only the last started thread (do_crawl) is joined. Both threads are
# daemons, so the do_crawl_rel thread is abandoned once do_crawl returns.
# NOTE(review): confirm this is the intended shutdown behaviour.
t.join()