def parse_total_page(self, response):
    """Read the pager ("W_pages") script block to find how many weibo pages
    the user has, then schedule the per-page ajax loads.

    Currently only page 1 / first load is actually requested; the second and
    third loads are prepared but disabled (see comments below).
    """
    page_analyzer = Analyzer()
    pages_pq = page_analyzer.get_html(response.body, 'script:contains("W_pages")')
    fc_analyzer = keyword_info_analyzer()
    # Number of friend-circle pages that need to be crawled.
    total_pages = fc_analyzer.get_totalpages(pages_pq)
    logger.info("the total_pages is: %d", total_pages)
    loader = GetWeibopage()
    base_url = response.meta['mainpage_url']
    uid = response.meta['uid']
    search_flag = response.meta['is_search']
    for idx in range(1):  # TODO: change to total_pages
        GetWeibopage.data['uid'] = uid
        GetWeibopage.data['page'] = idx + 1
        first_url = base_url + loader.get_firstloadurl()
        yield Request(url=first_url,
                      meta={'cookiejar': response.meta['cookiejar'],
                            'uid': uid,
                            'is_search': search_flag},
                      callback=self.parse_load)
        second_url = base_url + loader.get_secondloadurl()
        # Second load intentionally disabled:
        #yield Request(url=second_url,meta={'cookiejar':response.meta['cookiejar'],'uid':uid,'is_search':search_flag},callback=self.parse_load)
        third_url = base_url + loader.get_thirdloadurl()  # built but not requested
def start_getweiboinfo(self, response):
    """Walk every row of t_user_info and schedule the three ajax page loads
    (first/second/third) for each weibo page of each user."""
    store = OracleStore()
    conn = store.get_connection()
    sql1 = '''select * from t_user_info'''
    cursor1 = store.select_operation(conn, sql1)
    sql2 = '''select count(*) from t_user_info'''
    cursor2 = store.select_operation(conn, sql2)
    count = cursor2.fetchone()
    if count[0]:
        for _ in range(count[0]):
            # fetchmany(1) drains cursor1 one row per outer iteration.
            for row in cursor1.fetchmany(1):
                uid = row[0]
                base = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = uid
                pager = GetWeibopage()
                for page_idx in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page_idx + 1
                    # The three loads are requested in order; each getter is
                    # invoked right before its request is yielded.
                    first_url = base + pager.get_firstloadurl()
                    yield Request(url=first_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_load)
                    second_url = base + pager.get_secondloadurl()
                    yield Request(url=second_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_load)
                    third_url = base + pager.get_thirdloadurl()
                    yield Request(url=third_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_load)
    else:
        yield None
    store.close_connection(conn, cursor1, cursor2)
def start_getweiboinfo(self, response):
    """Schedule the three ajax loads for every weibo page of ``self.uid``.

    NOTE(review): an earlier variant of this method looked uids up from the
    Oracle table "t_user_keyword" by keyword; that path was commented out and
    has been removed here — only the single-uid path is live.
    """
    base = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()
    for page_idx in range(WeiboSpider.page_num):
        GetWeibopage.data['page'] = page_idx + 1
        first_url = base + pager.get_firstloadurl()
        yield Request(url=first_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        second_url = base + pager.get_secondloadurl()
        yield Request(url=second_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        third_url = base + pager.get_thirdloadurl()
        yield Request(url=third_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def get_userinfo(self, response):
    """Request the first profile page load for every user whose image has not
    been fetched yet (imagestate = 0); responses go to ``get_userurl``."""
    store = OracleStore()
    conn = store.get_connection()
    rows_sql = "select * from t_user_info where imagestate = 0"
    row_cursor = store.select_operation(conn, rows_sql)
    count_sql = "select count(*) from t_user_info where imagestate = 0"
    count_cursor = store.select_operation(conn, count_sql)
    total = count_cursor.fetchone()
    for _ in range(total[0]):
        for row in row_cursor.fetchmany(1):
            if row[0]:
                uid = row[0]
                base = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = uid
                pager = GetWeibopage()
                GetWeibopage.data['page'] = 1
                first_url = base + pager.get_firstloadurl()
                yield Request(url=first_url,
                              meta={'cookiejar': response.meta['cookiejar'], 'uid': uid},
                              callback=self.get_userurl)
def start_getweiboinfo(self, response):
    """Schedule first/second/third ajax loads for users in t_user_follow whose
    content has not been crawled yet (contentstate = 0).

    Only the first 10 rows are processed per run (hard cap in the loop below).
    """
    store = MysqlStore()
    conn = store.get_connection()
    rows_sql = "select * from t_user_follow where contentstate = 0"
    row_cursor = store.select_operation(conn, rows_sql)
    count_sql = "select count(*) from t_user_follow where contentstate = 0"
    count_cursor = store.select_operation(conn, count_sql)
    count = count_cursor.fetchone()
    # count[0] is the number of users still to crawl; capped at 10 for now.
    for _ in range(10):  # (count[0])
        for row in row_cursor.fetchmany(1):
            if row[1]:
                uid = row[1]
                base = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = uid
                pager = GetWeibopage()
                for page_idx in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page_idx + 1
                    first_url = base + pager.get_firstloadurl()
                    yield Request(url=first_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_firstload)
                    second_url = base + pager.get_secondloadurl()
                    yield Request(url=second_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_secondload)
                    third_url = base + pager.get_thirdloadurl()
                    yield Request(url=third_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.parse_thirdload)
def start_getweiboinfo(self, response):
    """Schedule the three ajax loads for a two-page window of ``self.uid``'s
    weibo pages, starting at ``self.per_page_num``."""
    base = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()
    first_page = int(self.per_page_num)
    # Crawl exactly two pages: first_page and first_page + 1.
    for page_no in range(first_page, first_page + 2):
        GetWeibopage.data['page'] = page_no
        first_url = base + pager.get_firstloadurl()
        yield Request(url=first_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        second_url = base + pager.get_secondloadurl()
        yield Request(url=second_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        third_url = base + pager.get_thirdloadurl()
        yield Request(url=third_url,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def get_userinfo(self, response):
    """Request the third ajax load of one hard-coded account's main page and
    hand the response to ``get_userurl``.

    NOTE(review): the uid 1227086635 is hard-coded — presumably a test
    account; confirm before reuse.
    """
    fixed_uid = 1227086635
    base = 'http://weibo.com/u/' + str(fixed_uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
    GetWeibopage.data['uid'] = fixed_uid
    pager = GetWeibopage()
    GetWeibopage.data['page'] = 1
    # First load deliberately skipped:
    #firstloadurl = base + pager.get_firstloadurl()
    third_url = base + pager.get_thirdloadurl()
    yield Request(url=third_url,
                  meta={'cookiejar': response.meta['cookiejar']},
                  callback=self.get_userurl)
def get_relation(self, response):
    '''Schedule follow-list and follower-list requests for self.uid,
    walking each list's pages from last to first.'''
    pager = GetWeibopage()
    # (main-url builder, number of pages, parse callback) — follows first,
    # then followers, matching the original request order.
    jobs = (
        (getinfo.get_follow_mainurl, WeiboSpider.follow_page_num, self.parse_follow),
        (getinfo.get_follower_mainurl, WeiboSpider.follower_page_num, self.parse_follower),
    )
    for build_mainurl, page_count, parse_cb in jobs:
        for page_no in range(page_count, 0, -1):
            GetWeibopage.relation_data['page'] = page_no
            page_url = build_mainurl(self.uid) + pager.get_relation_paramurl()
            yield Request(url=page_url,
                          meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                          callback=parse_cb)
def get_userinfo(self, response):
    """For each uid in self.uid_list, request the first profile page load;
    responses are handled by ``get_userurl``."""
    #db = OracleStore();conn = db.get_connection()
    for uid in self.uid_list:
        # Former incremental-crawl dedup check against Oracle, disabled:
        #sql = "select count(*) from (select userID from t_user_info where userID='%s' union select userID from t_publicuser_info where userID='%s')" % (uid,uid)
        #cursor = db.select_operation(conn,sql);count = cursor.fetchone()
        #if not count[0]:  # uid has not been crawled before
        print "!!scraping each uid:",uid
        mainpageurl = 'http://weibo.com/u/'+str(uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
        GetWeibopage.data['uid'] = uid
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.get_userurl)
def start_getweiboinfo(self, response):
    """For every row in t_user_info, schedule the three ajax loads of each of
    the user's weibo pages; all responses go to ``parse_load``."""
    db = OracleStore()
    conn = db.get_connection()
    all_rows_sql = '''select * from t_user_info'''
    rows_cur = db.select_operation(conn, all_rows_sql)
    row_count_sql = '''select count(*) from t_user_info'''
    count_cur = db.select_operation(conn, row_count_sql)
    row_count = count_cur.fetchone()
    if not row_count[0]:
        yield None
    else:
        for _ in range(row_count[0]):
            for record in rows_cur.fetchmany(1):
                user_id = record[0]
                main_url = 'http://weibo.com/u/' + str(user_id) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = user_id
                page_helper = GetWeibopage()
                for p in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = p + 1
                    # first, second and third load — each url is built
                    # immediately before it is requested.
                    url_1 = main_url + page_helper.get_firstloadurl()
                    yield Request(url=url_1,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id},
                                  callback=self.parse_load)
                    url_2 = main_url + page_helper.get_secondloadurl()
                    yield Request(url=url_2,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id},
                                  callback=self.parse_load)
                    url_3 = main_url + page_helper.get_thirdloadurl()
                    yield Request(url=url_3,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id},
                                  callback=self.parse_load)
    db.close_connection(conn, rows_cur, count_cur)
def start_getweiboinfo(self, response):
    """Schedule the three ajax loads for each of ``self.uid``'s weibo pages
    (pages 1..WeiboSpider.page_num); all responses go to ``parse_load``."""
    profile_url = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    helper = GetWeibopage()
    for idx in range(WeiboSpider.page_num):
        GetWeibopage.data['page'] = idx + 1
        load_1 = profile_url + helper.get_firstloadurl()
        yield Request(url=load_1,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        load_2 = profile_url + helper.get_secondloadurl()
        yield Request(url=load_2,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        load_3 = profile_url + helper.get_thirdloadurl()
        yield Request(url=load_3,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def parse_userinfo(self, response):
    """Extract the user-info panel ("PCD_text_b" script block) from the
    profile response, store it on the item, then request the last page's
    third ajax load so ``parse_thirdload`` can finish the item."""
    item = response.meta['item']
    analyzer = Analyzer()
    profile_pq = analyzer.get_html(response.body, 'script:contains("PCD_text_b")')
    item['userinfo'] = analyzer.get_userinfo(profile_pq)
    uid = response.meta['uid']
    base = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
    GetWeibopage.data['uid'] = uid
    pager = GetWeibopage()
    # Jump straight to the last page's third load.
    GetWeibopage.data['page'] = WeiboSpider.page_num - 1
    third_url = base + pager.get_thirdloadurl()
    yield Request(url=third_url,
                  meta={'cookiejar': response.meta['cookiejar'],
                        'item': item,
                        'uid': uid,
                        'followlist': response.meta['followlist']},
                  callback=self.parse_thirdload)
def start_getweiboinfo(self, response):
    """Schedule first/second/third ajax-load requests for every user in
    t_user_follow whose content has not been crawled yet (contentstate = 0).

    Bug fix: the user loop iterated over the bare int ``count[0]``
    (``for i in (count[0])`` — a TypeError at runtime); it now iterates
    ``range(count[0])``, matching the sibling implementations of this method.
    """
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(count[0]):  # count[0] = number of users still to crawl
        for result in cursor1.fetchmany(1):
            if result[1]:
                mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[1]
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                  callback=self.parse_firstload)
                    secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                    yield Request(url=secondloadurl,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                  callback=self.parse_secondload)
                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    yield Request(url=thirdloadurl,
                                  meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                  callback=self.parse_thirdload)
def parse_follow(self, response):
    """Parse one page of a user's follow list and, for new (not-yet-crawled)
    follow uids, schedule crawling of their main pages.

    Uses a flag file (via getinfo.get_followflag/set_followflag) to do
    incremental crawling: oldflag is the uid where the previous run stopped;
    stopflag guards updating it exactly once per run.
    """
    #print '************************ source request url:',response.request.url
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_followhtml(response.body)
    #item['followuidlist'] = analyzer.get_follow(total_pq)
    followlist = analyzer.get_follow(total_pq)
    #item['userinfo'] = {}
    oldflag,stopflag= getinfo.get_followflag(WeiboSpider.filename)
    p = re.compile('.*_page=(\d).*',re.S)
    current_page = p.search(response.request.url).group(1)  # current follow-list page number
    if int(current_page) == 1:
        # On page 1, remember the newest follow uid as the next run's marker.
        getinfo.set_followflag(WeiboSpider.filename,followlist[0],'False')
        print 'page is equal 1 '
    else:
        print 'page is NOT equal 1'
    for follow_uid in followlist[:2]:
        print '%%%%%%%%%%%%%%%%%%%%%%%%%%',follow_uid
        #item['uid'] = follow_uid
        if follow_uid != oldflag:  # skip uids crawled in a previous run (incremental crawl)
            # Crawl this uid's main-page weibo content.
            if stopflag == 'False':
                getinfo.set_followflag(WeiboSpider.filename,followlist[0],'True')
                mainpageurl = 'http://weibo.com/u/'+str(follow_uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = follow_uid
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page+1
                    # first load of the page
                    # second load of the page
                    # third load of the page
                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    # On the last load of the last page, fetch the user's basic info.
                    if int(GetWeibopage.data['pagebar']) == 1 and page == WeiboSpider.page_num-1:
                        print 'hhhhhhhhhhhhhhhhhhhh',followlist
                        yield Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid,'followlist':followlist},callback=self.get_userurl)
                    #continue
                    #yield Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid},callback=self.parse_thirdload)
                    #firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    #yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid},callback=self.parse_firstload)
            else:
                break
        else:
            break
def parse_follower(self, response):
    """Parse one page of a user's follower list, emit the item, and — for the
    seed user only — fan out to each follower's own follow/follower lists
    unless that follower was already scraped (tracked in Oracle)."""
    item = WeibospiderItem()
    analyzer = Analyzer()
    getweibopage = GetWeibopage()
    total_follower_pq = analyzer.get_followerhtml(response.body)
    item['uid'] = response.meta['uid']
    item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
    item['follow_uid_list'] = []
    yield item
    # Fetch follows and followers of second-level (follower) users.
    if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
        db = OracleStore()
        conn = db.get_connection()
        for follower_uid in item['follower_uid_list']:
            # Fetch this follower's follow list.
            # NOTE(review): SQL built by string interpolation — fine for
            # numeric uids, but parameterized queries would be safer.
            sql1 = """select count(*) from t_user_follow where userID=%s""" % str(
                follower_uid)
            cursor1 = db.select_operation(conn, sql1)
            count1 = cursor1.fetchone()
            follower_scraped = count1[0]
            cursor1.close()
            if not follower_scraped:  # 0 means this account has not been scraped yet
                follow_url = 'http://weibo.com/%s/follow?page=1' % str(
                    follower_uid)
                yield Request(url=follow_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': follower_uid
                              },
                              dont_filter=True,
                              callback=self.parse_based_follownum)
            else:
                print 'follow_uid existed!', follower_uid
                yield None
            # Fetch this follower's own follower list.
            sql2 = """select count(*) from t_user_follower where userID=%s""" % str(
                follower_uid)
            cursor2 = db.select_operation(conn, sql2)
            count2 = cursor2.fetchone()
            follower_scraped = count2[0]
            cursor2.close()
            if not follower_scraped:  # 0 means this account has not been scraped yet
                follower_url = 'http://weibo.com/%s/fans?page=1' % str(
                    follower_uid)
                yield Request(url=follower_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': follower_uid
                              },
                              dont_filter=True,
                              callback=self.parse_based_followernum)
            else:
                print 'follower_uid existed!', follower_uid
                yield None
        conn.close()
def parse_follower(self, response):
    """Parse one page of a user's follower list, emit the item, and — for the
    seed user only — schedule paged follow/follower list requests for each
    follower that has not been scraped before (tracked in Oracle)."""
    item = WeibospiderItem()
    analyzer = Analyzer()
    getweibopage = GetWeibopage()
    total_follower_pq = analyzer.get_followerhtml(response.body)
    item['uid'] = response.meta['uid']
    item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
    item['follow_uid_list'] = []
    yield item
    if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
        db = OracleStore()
        conn = db.get_connection()
        for follower_uid in item['follower_uid_list']:
            # Fetch this follower's follow list.
            sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid)
            cursor1 = db.select_operation(conn,sql1)
            count1 = cursor1.fetchone()
            follower_scraped = count1[0]
            cursor1.close()
            if not follower_scraped:  # 0 means this account has not been scraped yet
                # Walk the follow-list pages from last to first.
                for page in range(WeiboSpider.follow_page_num,0,-1):
                    GetWeibopage.relation_data['page'] = page
                    follow_url = getinfo.get_follow_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                    yield Request(url=follow_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follow)
            else:
                print 'follow_uid existed!',follower_uid
                yield None
            # Fetch this follower's own follower list.
            sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid)
            cursor2 = db.select_operation(conn,sql2)
            count2 = cursor2.fetchone()
            follower_scraped = count2[0]
            cursor2.close()
            if not follower_scraped:  # 0 means this account has not been scraped yet
                # Walk the follower-list pages from last to first.
                for page in range(WeiboSpider.follower_page_num,0,-1):
                    GetWeibopage.relation_data['page'] = page
                    follower_url = getinfo.get_follower_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                    yield Request(url=follower_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follower)
            else:
                print 'follower_uid existed!',follower_uid
                yield None
        conn.close()
def start_getweibo_info(self, response):
    """Poll cauc_warning_man_test: for never-searched warning-list users
    request their whole history (from self.start_time), mark them searched,
    sleep, then (request disabled below) re-crawl the last day for
    already-searched users."""
    db = MysqlStore();
    # Select key persons that have not been crawled yet and is_delete = 0.
    GetWeibopage.data['page'] = 1;
    getweibopage = GetWeibopage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn,sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s",user_id)
            # Mark the user as searched (is_search = 1).
            sql2 = "update cauc_warning_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn,sql2)
            # Determine the total number of pages to crawl.
            start_time = self.start_time;end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page)
        logger.info("current timestamp:%d",int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval could be used).
        time.sleep(WeiboSpider.settings['WEIBOCONTENT_INTERVAL'])
        # Select warning-list users already crawled (is_search = 1, is_delete = 0).
        sql3 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn,sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s",user_id)
            # Window of one day (86400 s) ending now.
            start_time = get_time_by_interval(int(time.time()),86400,'hour');end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            # NOTE(review): this request is disabled — url is built but never yielded.
            #yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
        conn.close()
def start_getweibo_info(self, response):
    """Poll cauc_black_man_test: for never-searched blacklist users request
    their whole history (from self.start_time), mark them searched, sleep,
    then (request disabled below) re-crawl the last day for already-searched
    users."""
    db = MysqlStore();
    # Select key persons that have not been crawled yet and is_delete = 0.
    GetWeibopage.data['page'] = 1;
    getweibopage = GetWeibopage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_black_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn,sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s",user_id)
            # Mark the user as searched (is_search = 1).
            sql2 = "update cauc_black_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn,sql2)
            # Determine the total number of pages to crawl.
            start_time = self.start_time;end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page)
        logger.info("current timestamp:%d",int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval could be used).
        time.sleep(WeiboSpider.settings['FRIENDCIRCAL_INTERVAL'])
        # Select blacklist users already crawled (is_search = 1, is_delete = 0).
        sql3 = "select user_id from cauc_black_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn,sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s",user_id)
            # Window of one day (86400 s) ending now.
            start_time = get_time_by_interval(int(time.time()),86400,'hour');end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            # NOTE(review): this request is disabled — url is built but never yielded.
            #yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
        conn.close()
def get_userinfo(self, response):
    """Request the first profile page load for each user still lacking an
    image (imagestate = 0); responses are handled by ``get_userurl``."""
    oracle = OracleStore()
    connection = oracle.get_connection()
    pending_sql = "select * from t_user_info where imagestate = 0"
    pending_cur = oracle.select_operation(connection, pending_sql)
    pending_count_sql = "select count(*) from t_user_info where imagestate = 0"
    pending_count_cur = oracle.select_operation(connection, pending_count_sql)
    pending_total = pending_count_cur.fetchone()
    for _ in range(pending_total[0]):
        for record in pending_cur.fetchmany(1):
            user_id = record[0]
            if user_id:
                profile_url = 'http://weibo.com/u/' + str(user_id) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = user_id
                helper = GetWeibopage()
                GetWeibopage.data['page'] = 1
                load_url = profile_url + helper.get_firstloadurl()
                yield Request(url=load_url,
                              meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id},
                              callback=self.get_userurl)
def get_userinfo(self, response):
    """For each uid in self.uid_list, request the first profile page load;
    responses are handled by ``get_userurl``."""
    #db = OracleStore();conn = db.get_connection()
    for uid in self.uid_list:
        # Former incremental-crawl dedup check against Oracle, disabled:
        #sql = "select count(*) from (select userID from t_user_info where userID='%s' union select userID from t_publicuser_info where userID='%s')" % (uid,uid)
        #cursor = db.select_operation(conn,sql);count = cursor.fetchone()
        #if not count[0]:  # uid has not been crawled before
        print "!!scraping each uid:", uid
        mainpageurl = 'http://weibo.com/u/' + str(
            uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        GetWeibopage.data['uid'] = uid
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'uid': uid
                      },
                      callback=self.get_userurl)
def start_getweiboinfo(self, response):
    """Schedule the three ajax loads for every weibo page of ``self.uid``.

    NOTE(review): an earlier variant resolved uids from the Oracle table
    "t_user_keyword" by keyword; that commented-out path has been removed —
    only the single-uid path is live.
    """
    profile_url = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    helper = GetWeibopage()
    for page_index in range(WeiboSpider.page_num):
        GetWeibopage.data['page'] = page_index + 1
        # First, second and third load of the current page, in order.
        url_first = profile_url + helper.get_firstloadurl()
        yield Request(url=url_first,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        url_second = profile_url + helper.get_secondloadurl()
        yield Request(url=url_second,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        url_third = profile_url + helper.get_thirdloadurl()
        yield Request(url=url_third,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def start_getweibo_info(self,response): db = MysqlStore(); #取出没有爬取过的且is_delete=0的重点人员 GetWeibopage.data['page'] = 1; getweibopage = GetWeibopage() #for round in range(1): #遍历数据库的轮数 conn = db.get_connection() sql1 = "select user_id from cauc_warning_man a \ where a.is_search = 0 and a.is_delete = 0" cursor1 = db.select_operation(conn,sql1) for user_id in cursor1.fetchall(): user_id = user_id[0] logger.info("this is the unsearched user_id:%s",user_id) #获取需要爬取的总页面数 start_time = self.start_time;end_time = get_current_time('day') mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&" GetWeibopage.data['uid'] = user_id; thirdload_url = mainpage_url + getweibopage.get_thirdloadurl() yield Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page) #取出已经爬取过is_search=1的且is_delete=0的预警人员 sql2 = "select user_id from cauc_warning_man a \ where a.is_search = 1 and a.is_delete = 0" cursor2 = db.select_operation(conn,sql2) for user_id in cursor2.fetchall(): user_id = user_id[0] logger.info("this is the searched user_id:%s",user_id) #start_time = get_time_by_interval(int(time.time()),86400,'day');end_time = get_current_time('day') #起始和结束间隔时间为1天(86400s),即过去一天的内容 start_time = get_time_by_interval(int(time.time()),int(self.interval),'day');end_time = get_current_time('day') #起始和结束间隔时间为x天(由interval代表的秒换算而来) mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&" GetWeibopage.data['uid'] = user_id; thirdload_url = mainpage_url + getweibopage.get_thirdloadurl() yield Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page) 
#更新is_search标志位为1 sql3 = "update cauc_warning_man set is_search = 1 where is_search = 0 and is_delete = 0" db.update_operation(conn,sql3) db.close_connection(conn)
def parse_total_page(self, response):
    """Parse the pager ("W_pages") script block to learn how many weibo pages
    the user has, then schedule the per-page ajax loads.  Currently only
    page 1 / first load is requested (see TODO and disabled yields below)."""
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body,'script:contains("W_pages")')
    friendcircle_analyzer = keyword_info_analyzer()
    total_pages = friendcircle_analyzer.get_totalpages(total_pq)  # number of friend-circle pages to crawl
    logger.info("the total_pages is: %d",total_pages)
    getweibopage = GetWeibopage()
    mainpage_url = response.meta['mainpage_url']
    user_id = response.meta['uid']
    is_search = response.meta['is_search']
    for page in range(1):  # TODO: change to total_pages
        GetWeibopage.data['uid'] = user_id
        GetWeibopage.data['page'] = page + 1
        firstload_url = mainpage_url + getweibopage.get_firstloadurl()
        yield Request(url=firstload_url,meta={'cookiejar':response.meta['cookiejar'],'uid':user_id,'is_search':is_search},callback=self.parse_load)
        secondload_url = mainpage_url + getweibopage.get_secondloadurl()
        #yield Request(url=secondload_url,meta={'cookiejar':response.meta['cookiejar'],'uid':user_id,'is_search':is_search},callback=self.parse_load)
        thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()  # NOTE(review): built but never requested
def get_follow(self, response):
    """Request every follow-list page of the seed user (WeiboSpider.start_uid),
    walking the pages from last to first; responses go to ``parse_follow``."""
    pager = GetWeibopage()
    for page_no in range(WeiboSpider.follow_page_num, 0, -1):
        GetWeibopage.followdata['Pl_Official_RelationMyfollow__108_page'] = page_no
        page_url = getinfo.get_url(WeiboSpider.start_uid) + pager.get_followurl()
        yield Request(url=page_url,
                      meta={'cookiejar': response.meta['cookiejar']},
                      callback=self.parse_follow)
class WeiboSpider(CrawlSpider): name = 'userfollow' allowed_domains = ['weibo.com', 'sina.com.cn'] settings = get_project_settings() start_username = settings['USER_NAME'] start_password = settings['PASS_WORD'] start_uid = settings['UID'] page_num = settings['PAGE_NUM'] follow_page_num = settings['FOLLOW_PAGE_NUM'] follower_page_num = settings['FOLLOWER_PAGE_NUM'] getweibopage = GetWeibopage() def __init__(self, uid=None): self.uid = uid def start_requests(self): username = WeiboSpider.start_username url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&client=ssologin.js(v1.4.4)' % username return [Request(url=url, method='get', callback=self.post_requests)] def post_requests(self, response): serverdata = re.findall( '{"retcode":0,"servertime":(.*?),"pcid":.*?,"nonce":"(.*?)","pubkey":"(.*?)","rsakv":"(.*?)","exectime":.*}', response.body, re.I)[0] #获取get请求的数据,用于post请求登录 #print '!!!!GET responsebody:',response.body #print '!!!!serverdata',serverdata[0] servertime = serverdata[0] nonce = serverdata[1] pubkey = serverdata[2] rsakv = serverdata[3] username = WeiboSpider.start_username password = WeiboSpider.start_password formdata = { 'entry': 'weibo', 'gateway': '1', 'from': '', 'ssosimplelogin': '******', 'vsnf': '1', 'vsnval': '', 'su': getinfo.get_user(username), 'service': 'miniblog', 'servertime': servertime, 'nonce': nonce, 'pwencode': 'rsa2', 'sp': getinfo.get_pwd(password, servertime, nonce, pubkey), 'encoding': 'UTF-8', 'prelt': '115', 'rsakv': rsakv, 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack', 'returntype': 'META' } headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11' } return [ FormRequest( url= 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.4)', formdata=formdata, headers=headers, callback=self.get_cookie) ] def get_cookie(self, 
response): #print 'response:~~~~~~~~~~~~~~~',response.body p = re.compile('location\.replace\(\'(.*)\'\)') try: login_url = p.search(response.body).group(1) #print '==============',login_url ret_res = re.search('retcode=0', login_url) if ret_res: print 'Login Success!!!!' else: print 'Login Fail!!!!' except: print 'Login Error!!!!' request = response.request.replace( url=login_url, meta={'cookiejar': 1}, method='get', callback=self.get_relation_pagenum ) #GET请求login_url获取返回的cookie,后续发送Request携带此cookie return request def get_relation_pagenum(self, response): follow_url = 'http://weibo.com/%s/follow?page=1' % str(self.uid) follower_url = 'http://weibo.com/%s/fans?page=1' % str(self.uid) yield Request(url=follow_url, meta={ 'cookiejar': 1, 'uid': self.uid }, dont_filter=True, callback=self.parse_based_follownum) yield Request(url=follower_url, meta={ 'cookiejar': 1, 'uid': self.uid }, dont_filter=True, callback=self.parse_based_followernum) def parse_based_follownum(self, response): item = WeibospiderItem() analyzer = Analyzer() total_follow_pq = analyzer.get_childfollowhtml(response.body) follow_page_num = analyzer.get_relation_pagenum(total_follow_pq) if follow_page_num != "" and int(follow_page_num) >= 5: for page in range(5, 0, -1): GetWeibopage.relation_data['page'] = page follow_url = getinfo.get_follow_mainurl( response.meta['uid'] ) + WeiboSpider.getweibopage.get_relation_paramurl() yield Request(url=follow_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': response.meta['uid'] }, callback=self.parse_follow) elif follow_page_num == "": follow_url = 'http://weibo.com/%s/follow?page=1' % response.meta[ 'uid'] yield Request(url=follow_url, meta={ 'cookiejar': 1, 'uid': response.meta['uid'] }, callback=self.parse_follow) else: for page in range(int(follow_page_num), 0, -1): GetWeibopage.relation_data['page'] = page follow_url = getinfo.get_follow_mainurl( response.meta['uid'] ) + WeiboSpider.getweibopage.get_relation_paramurl() yield 
Request(url=follow_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': response.meta['uid'] }, callback=self.parse_follow) def parse_based_followernum(self, response): item = WeibospiderItem() analyzer = Analyzer() total_follower_pq = analyzer.get_followerhtml(response.body) follower_page_num = analyzer.get_relation_pagenum(total_follower_pq) if follower_page_num != "" and int(follower_page_num) >= 5: for page in range(5, 0, -1): GetWeibopage.relation_data['page'] = page follower_url = getinfo.get_follower_mainurl( response.meta['uid'] ) + WeiboSpider.getweibopage.get_relation_paramurl() yield Request(url=follower_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': response.meta['uid'] }, callback=self.parse_follower) elif follower_page_num == "": follower_url = 'http://weibo.com/%s/fans?page=1' % response.meta[ 'uid'] yield Request(url=follower_url, meta={ 'cookiejar': 1, 'uid': response.meta['uid'] }, callback=self.parse_follower) #yield None else: for page in range(int(follower_page_num), 0, -1): GetWeibopage.relation_data['page'] = page follower_url = getinfo.get_follower_mainurl( response.meta['uid'] ) + WeiboSpider.getweibopage.get_relation_paramurl() yield Request(url=follower_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': response.meta['uid'] }, callback=self.parse_follower) def parse_follow(self, response): item = WeibospiderItem() analyzer = Analyzer() total_follow_pq = analyzer.get_childfollowhtml(response.body) item['uid'] = response.meta['uid'] item['follow_uid_list'] = analyzer.get_childfollow(total_follow_pq) item['follower_uid_list'] = [] yield item #获取二级(关注)用户的关注和粉丝 if self.uid == response.meta['uid'] and len(item['follow_uid_list']): db = OracleStore() conn = db.get_connection() for follow_uid in item['follow_uid_list']: #获取关注用户的关注用户 sql1 = """select count(*) from t_user_follow where userID=%s""" % str( follow_uid) cursor1 = db.select_operation(conn, sql1) count1 = cursor1.fetchone() follow_scraped = count1[0] 
cursor1.close() if not follow_scraped: #scraped为0,即该账户没有获取过 follow_url = 'http://weibo.com/%s/follow?page=1' % str( follow_uid) yield Request(url=follow_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follow_uid }, dont_filter=True, callback=self.parse_based_follownum) else: print 'follow_uid existed!', follow_uid yield None #获取关注用户的粉丝用户 sql2 = """select count(*) from t_user_follower where userID=%s""" % str( follow_uid) cursor2 = db.select_operation(conn, sql2) count2 = cursor2.fetchone() follower_scraped = count2[0] cursor2.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follower_url = 'http://weibo.com/%s/fans?page=1' % str( follow_uid) yield Request(url=follower_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follow_uid }, dont_filter=True, callback=self.parse_based_followernum) else: print 'follower_uid existed!', follow_uid yield None conn.close() def parse_follower(self, response): item = WeibospiderItem() analyzer = Analyzer() getweibopage = GetWeibopage() total_follower_pq = analyzer.get_followerhtml(response.body) item['uid'] = response.meta['uid'] item['follower_uid_list'] = analyzer.get_follower(total_follower_pq) item['follow_uid_list'] = [] yield item #获取二级(粉丝)用户的关注和粉丝 if self.uid == response.meta['uid'] and len(item['follower_uid_list']): db = OracleStore() conn = db.get_connection() for follower_uid in item['follower_uid_list']: #获取粉丝用户的关注用户 sql1 = """select count(*) from t_user_follow where userID=%s""" % str( follower_uid) cursor1 = db.select_operation(conn, sql1) count1 = cursor1.fetchone() follower_scraped = count1[0] cursor1.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follow_url = 'http://weibo.com/%s/follow?page=1' % str( follower_uid) yield Request(url=follow_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follower_uid }, dont_filter=True, callback=self.parse_based_follownum) else: print 'follow_uid existed!', follower_uid yield None #获取粉丝用户的粉丝用户 sql2 = """select count(*) from 
t_user_follower where userID=%s""" % str( follower_uid) cursor2 = db.select_operation(conn, sql2) count2 = cursor2.fetchone() follower_scraped = count2[0] cursor2.close() if not follower_scraped: #scraped为0,即该账户没有获取过 follower_url = 'http://weibo.com/%s/fans?page=1' % str( follower_uid) yield Request(url=follower_url, meta={ 'cookiejar': response.meta['cookiejar'], 'uid': follower_uid }, dont_filter=True, callback=self.parse_based_followernum) else: print 'follower_uid existed!', follower_uid yield None conn.close()