def insert_proxies(proxies_list):
    """Persist a batch of proxy rows into pt_db.spide_proxies_ip.

    :param proxies_list: sequence of (proxy_host, proxy_port) rows; no-op
        when empty.
    """
    if not proxies_list:
        return
    con = MyPyMysql(**mysql_config)
    try:
        # REPLACE keeps the table deduplicated on its unique key.
        sql = """ replace into pt_db.spide_proxies_ip (proxy_host,proxy_port) values %s """
        con.insert_query(sql, proxies_list)
        mylog.info('insert :' + sql + str(proxies_list))
    finally:
        # Original leaked the connection when insert_query raised.
        con.close_connect()
@gen.coroutine
def getfollowlist(current_uk):
    """Incrementally crawl the follow list of ``current_uk`` into MySQL.

    Compares the API's total_count against the last logged follow count and
    fetches only the delta, upserts the followed users into
    pt_db.spide_all_person, updates the per-user log, and resolves (via
    ``gen.Return``) to the list of uks that still have followers to crawl.
    Resolves to ``[]`` on any failure.

    NOTE(review): the original had no ``@gen.coroutine`` decorator despite
    using ``yield``/``gen.Return`` — added here; confirm no caller consumes
    it as a bare generator.
    """

    @gen.coroutine
    def followquery(query_uk, start, limit):
        # Fetch one page of the follow API; retries until errno == 0.
        url = follow_url.format(start, limit, query_uk)
        mylog.info('follow: ' + url)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # BUGFIX: the original discarded the retry's result and returned
            # the raw failed response body (a string, not [total, list]).
            retried = yield followquery(query_uk, start, limit)
            raise gen.Return(retried)
        raise gen.Return([list_data['total_count'], list_data['follow_list']])

    try:
        start = 0
        limit = 24
        url = follow_url.format(start, limit, current_uk)  # kept for error log
        first_page = yield followquery(current_uk, start, limit)
        total_count = first_page[0]
        follow_list = []
        person_data = []
        con = MyPyMysql(**mysql_config)
        sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
        query_data = con.query(sql, current_uk)
        query_follows = query_data[0]['follow_nums'] if query_data else 0
        # Incremental update: only fetch the pages covering new followers.
        for_follows = total_count - query_follows
        if for_follows > 0:
            # // keeps integer page arithmetic on both Python 2 and 3.
            for page in range((for_follows - 1) // limit + 1):
                start = page * limit
                url = follow_url.format(start, limit, current_uk)
                range_data = yield followquery(current_uk, start, limit)
                follow_list.extend(range_data[1])
        for person in follow_list:
            person_data.append([
                person['follow_uk'],
                person['follow_uname'].encode("utf-8"),
                person['fans_count'],
                person['follow_count'],
                person['pubshare_count'],
            ])
        # Upsert every discovered user.
        if person_data:
            sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
            con.insert_query(sql, person_data)
        # Record this user's current follow count in the crawl log.
        sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
        con.query(sql, (current_uk, total_count, total_count, now_time))
        # Users that still have followers worth crawling.
        sql = """ SELECT p.uk FROM pt_db.spide_all_person p where p.follow_nums !=0 """
        uks = [row['uk'] for row in con.query(sql)]
        con.close_connect()
    except Exception as e:
        mylog.error('followlist 失败: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    raise gen.Return(uks)
def delete_proxies(proxies_list):
    """Delete the given proxy hosts from pt_db.spide_proxies_ip.

    :param proxies_list: iterable of proxy_host strings; no-op when empty.
    """
    if not proxies_list:
        return
    pmysql = MyPyMysql(**mysql_config)
    try:
        # Parameterized IN (...) instead of the original quote-concatenated
        # SQL string — immune to quoting/injection problems.
        placeholders = ','.join(['%s'] * len(proxies_list))
        sql = """delete from pt_db.spide_proxies_ip where proxy_host in (%s) ;""" % placeholders
        # assumes MyPyMysql.query passes the sequence through as driver
        # parameters, as it does for single values elsewhere — TODO confirm
        pmysql.query(sql, list(proxies_list))
        mylog.info('update :' + sql)
    finally:
        pmysql.close_connect()
@gen.coroutine
def put_ip():
    """Load all proxies from MySQL into the redis list ``proxy_ip_list``.

    When the table is empty, falls back to scraping a fresh batch via
    ``get_first_proxy_data``.

    NOTE(review): the original body contains ``yield`` but carried no
    ``@gen.coroutine`` decorator, so the fallback scrape could never run
    unless the bare generator was consumed by hand — decorator added;
    confirm against the call sites.
    """
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip order by rand();"""
    result = pmysql.query(sql)
    for row in result:
        r.rpush("proxy_ip_list", json.dumps(row))
        mylog.info('向proxy_ip_list加数据')
    pmysql.close_connect()
    # `not result` already covers both None and an empty list.
    if not result:
        mylog.info('数据库无代理IP...')
        yield get_first_proxy_data(page_n=2, if_proxy=True)
def put_ip():
    """Feed unseen usable proxies (status = 0) from MySQL into the worker queue.

    Dedupes across calls via the module-level ``proxy_ip_set`` and terminates
    the whole process when no proxies are left.
    """
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 ;"""
    result = pmysql.query(sql)
    for row in result:
        # Only enqueue hosts we have not handed out before.
        if row['proxy_host'] not in proxy_ip_set:
            proxy_ip_set.add(row['proxy_host'])
            proxy_ip_queue.put(row)
    pmysql.close_connect()
    # `not result` already covers both None and an empty list.
    if not result:
        # NOTE(review): hard process exit with no cleanup — presumably the
        # intended "out of proxies" shutdown; confirm before softening.
        os._exit(0)
def get_all_person(uk):
    """Return usable proxies plus the user's outstanding follow delta.

    :param uk: Baidu Pan user id.
    :return: (proxy rows with status = 0 in random order,
              follow_nums delta between spide_all_person and its log —
              0 when the user has no row).
    """
    pmysql = MyPyMysql(**mysql_config)
    try:
        # Usable proxy endpoints, shuffled.
        sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 order by rand() desc ;"""
        result = pmysql.query(sql)
        # How many new follows exist since the last crawl of this user.
        sql = """SELECT ifnull(p.follow_nums,0)-ifnull(l.follow_nums,0) as follow_nums FROM pt_db.spide_all_person p left join pt_db.spide_all_person_log l on p.uk = l.uk where p.share_nums !=0 and p.uk = %s """
        all_person = pmysql.query(sql, uk)
        # BUGFIX: this assignment sat behind a stray '#' in the original,
        # leaving uk_data undefined at the return (NameError).
        uk_data = int(all_person[0]['follow_nums']) if all_person else 0
    finally:
        pmysql.close_connect()
    return result, uk_data
@gen.coroutine
def getsharelist(current_uk, start, limit):
    """Fetch one page of ``current_uk``'s share feed and store new shares.

    Inserts each shared file into pt_db.spide_shares (insert ignore) and
    bumps the per-user share counter in spide_all_person_log. Always
    resolves to ``[]`` — results go straight to MySQL.

    NOTE(review): decorator added — the original used ``yield`` and
    ``gen.Return`` without ``@gen.coroutine``; confirm no caller relied on
    the bare generator.
    """
    url = None  # bound early so the except-logger can never NameError
    try:
        auth_type = 1
        url = share_url.format(auth_type, start, limit, current_uk)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # BUGFIX: retry the same page and propagate the retry's outcome;
            # the original discarded it and returned the failed body.
            retried = yield getsharelist(current_uk, start, limit)
            raise gen.Return(retried)
        records = list_data.get('records', [])
        insert_data = []
        for rec in records:
            if rec['feed_type'] != 'share':
                continue
            has_short = 'shorturl' in rec
            for f in rec['filelist']:
                insert_data.append([
                    f['fs_id'],
                    f['category'],
                    'http://pan.baidu.com/s/' if has_short
                    else 'http://pan.baidu.com/share/link?uk={0}&shareid='.format(current_uk),
                    rec['shorturl'].encode("utf-8") if has_short
                    else rec['shareid'].encode("utf-8"),
                    rec['public'].encode("utf-8"),
                    f['server_filename'].encode("utf-8"),
                    rec['uk'],
                    rec['username'].encode("utf-8"),
                    f['size'],
                    # feed_time is in milliseconds; // keeps integer seconds
                    # on both Python 2 and 3.
                    timestamptotime(rec['feed_time'] // 1000),
                ])
            # counts API records, not individual files, matching the original
        len_insert_data = len(records)
        con = MyPyMysql(**mysql_config)
        try:
            if insert_data:
                sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
                con.insert_query(sql, insert_data)
            # Track how far this user's shares have been crawled.
            sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
            con.query(sql, (current_uk, len_insert_data, len_insert_data, now_time))
        finally:
            # Original leaked the connection on any DB error.
            con.close_connect()
        mylog.info('sharelist 成功: ' + url)
    except gen.Return:
        # BUGFIX: tornado's gen.Return subclasses Exception, so the broad
        # handler below used to swallow the retry path's result and log a
        # spurious failure — re-raise it untouched.
        raise
    except Exception as e:
        mylog.error('sharelist 失败: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    raise gen.Return([])
yield getsharelist(current_uk, starts, limit) except Exception as e: mylog.error(e) finally: share_data.task_done() start = time.time() q.put(base_uk) share_data.put(base_uk) worker() # Start workers, then wait for the work queue to be empty. for i in range(3): consumer() yield q.join() yield share_data.join() # assert fetching == fetched print('Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched))) if __name__ == '__main__': mylog = Logger(main_logging_filename) mylog.info('爬虫开始....') con = MyPyMysql(**mysql_config) sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums,fan_nums,share_nums) values (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE follow_nums=follow_nums , m_time = %s;""" con.query(sql, [2164327417, 0, 0, 0, now_time]) con.close_connect() io_loop = ioloop.IOLoop.current().run_sync(main)