def crawl_save_fans(uk):
    max_limit = 25
    start = 0
    url = ("http://yun.baidu.com/pcloud/friend/getfanslist?limit="
           + str(max_limit) + "&query_uk=" + uk)
    d = fetch_data(url, start)
    tc = d.get('total_count', 0)
    while True:  # page through the fans list, including the last partial page
        for r in d.get('fans_list', []):
            try:
                bdp_db.db_exec(
                    "insert", bdp_db.insert_fans,
                    [uk, r.get('fans_uk', ''), r.get('follow_time', '')])  # save the fan relation
                bdp_db.db_exec("insert",
                               "insert into bdp_new_uk(uk)values(%s);",
                               [r.get('fans_uk', '')])  # queue the fan's uk for later crawling
            except Exception as e:
                logger.error("skip insert.err:" + str(e))
        start += max_limit  # advance paging offset
        if start >= tc:
            break
        d = fetch_data(url, start)
    logger.info("crawl_save_fans end.total_count=" + str(tc))
def loop_fans(uk):
    uks = bdp_db.db_exec("list",
                         "select fans_uk from fans_list where uk=%s;", uk)
    logger.debug(uks)
    if uks:
        for row in uks:
            do_crawl_rel(row[0])
def crawl_save_share(uk):
    max_limit = 100
    start = 0
    url = ("http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&limit="
           + str(max_limit) + "&query_uk=" + uk)
    d = fetch_data(url, start)
    tc = d.get('total_count', 0)
    while True:  # page through the share records, including the last partial page
        for r in d.get('records', []):
            category_6_cnt = r.get('category_6_cnt', -1)
            if category_6_cnt == -1:
                category_6_cnt = r.get('category_4_cnt', -1)
            try:
                # save the share record
                bdp_db.db_exec(
                    "insert", bdp_db.insert_share,
                    [r.get('uk', ''), r.get('shareid', ''), r.get('feed_type', ''),
                     r.get('category', ''), r.get('public', ''), r.get('data_id', ''),
                     r.get('title', ''), r.get('third', ''), r.get('clienttype', ''),
                     r.get('filecount', ''), r.get('username', ''), r.get('feed_time', ''),
                     r.get('desc', ''), r.get('avatar_url', ''), category_6_cnt,
                     r.get('source_uid', ''), r.get('source_id', ''), r.get('shorturl', ''),
                     r.get('vCnt', ''), r.get('dCnt', ''), r.get('tCnt', ''),
                     r.get('like_status', ''), r.get('like_count', ''),
                     r.get('comment_count', '')])
            except Exception as e:
                logger.warn("skip insert.err:" + str(e))
            # save the files attached to this share
            for f in r.get('filelist', []):
                try:
                    bdp_db.db_exec(
                        "insert", bdp_db.insert_file,
                        [r.get('shareid', ''), f.get('server_filename', ''),
                         f.get('category', ''), f.get('isdir', ''), f.get('size', ''),
                         f.get('fs_id', ''), f.get('path', ''), f.get('md5', ''),
                         f.get('sign', ''), f.get('time_stamp', '')])
                except Exception as e:
                    logger.error("skip insert.err:" + str(e))
        start += max_limit  # advance paging offset
        if start >= tc:
            break
        d = fetch_data(url, start)
    logger.debug("crawl_save_share end.total_count=" + str(tc))
def crawl_save_user(uk):
    d = fetch_data("http://yun.baidu.com/pcloud/user/getinfo?query_uk=" + uk, 0)
    d = d.get('user_info', None)
    if d is not None:
        try:
            row = [
                d.get('avatar_url', ''), d.get('fans_count', ''),
                d.get('follow_count', ''), d.get('album_count', ''),
                d.get('intro', ''), d.get('uname', ''), d.get('uk', ''),
                d.get('pubshare_count', ''), d.get('tui_user_count', ''),
                d.get('c2c_user_sell_count', ''), d.get('c2c_user_buy_count', ''),
                d.get('c2c_user_product_count', ''), d.get('pair_follow_type', '')
            ]
            bdp_db.db_exec("insert", bdp_db.insert_user, row)  # save the user profile
        except Exception as e:
            logger.error("skip insert.err:" + str(e))
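# --- illustration only (not part of the original crawler) --------------------
# Every crawl_* function above calls a fetch_data(url, start) helper that is
# defined elsewhere in this project. The hypothetical fetch_data_sketch below
# only documents the contract those callers appear to assume: append the paging
# offset as a "start" query parameter, GET the pan API URL, and return the
# parsed JSON dict (an empty dict on failure). The real helper may add cookies,
# referer headers, retries, or rate limiting.
import json
import requests  # assumption: requests is available; the real code may use urllib instead


def fetch_data_sketch(url, start):
    try:
        resp = requests.get(url, params={"start": start}, timeout=10)
        return json.loads(resp.text)
    except Exception as e:
        logger.error("fetch failed:" + str(e))
        return {}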
def do_crawl():
    global miss_times
    while 1:
        uks = bdp_db.db_exec("list", "select uk from bdp_new_uk limit %s;", "20")
        if len(uks) > 0:
            for row in uks:
                uk = row[0]
                if isExists(uk):
                    logger.warn("skip existing uk:" + uk)
                else:
                    crawl_save_share(uk)
                    crawl_save_user(uk)
                # dequeue the uk whether the crawl succeeded or not
                bdp_db.db_exec("delete",
                               "delete from bdp_new_uk where uk=%s;", uk)
        else:
            miss_times += 1
            if miss_times > 10:
                break
            sleep(2)
            logger.warn("bdp_new_uk empty,waiting 2s...>>" + str(miss_times))
def isExists(uk):
    logger.debug("isExists:" + uk)
    if bdp_db.db_exec("count",
                      "select count(*) from user_info where uk=%s;", uk) > 0:
        return 1
    return 0
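# --- illustration only (not part of the original crawler) --------------------
# bdp_db is this project's own database module. Its db_exec(mode, sql, params)
# helper is assumed to behave roughly as sketched below: "insert" and "delete"
# execute and commit, "list" returns all rows, and "count" returns a single
# scalar. The templates bdp_db.insert_fans / insert_share / insert_file /
# insert_user are assumed to be parameterized INSERT statements matching the
# value lists passed above. The pymysql dependency and the connection settings
# here are placeholders, not the project's real configuration.
import pymysql  # assumption: MySQL via pymysql; the real module may use MySQLdb


def db_exec_sketch(mode, sql, params=None):
    conn = pymysql.connect(host="localhost", user="root",
                           password="", db="bdp", charset="utf8")
    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            if mode == "list":
                return cur.fetchall()
            if mode == "count":
                return cur.fetchone()[0]
            conn.commit()  # "insert" / "delete"
            return cur.rowcount
    finally:
        conn.close()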
bdp_db.db_exec("delete", "delete from bdp_new_uk where uk=" + str(uk), uk) else: global miss_times miss_times += 1 if (miss_times > 10): break sleep(2) logger.warn("bdp_new_uk empty,waiting 2s...>>" + str(miss_times)) #do_crawl("891489109") #do_crawl("490155926") uk = bdp_db.db_exec("list", "select uk from bdp_new_uk limit %s;", "1")[0][0] # 创建两个线程 threads = [] t1 = threading.Thread(target=do_crawl_rel, args=(str(uk), )) threads.append(t1) bdp_db.db_exec("insert", "insert into bdp_new_uk(uk)values(%s);", [uk]) #insert new t2 = threading.Thread(target=do_crawl, args=()) threads.append(t2) for t in threads: t.setDaemon(True) t.start() t.join()