def start_getweiboinfo(self, response):
    """Schedule the three lazy-load chunk requests for two consecutive profile pages.

    Pages crawled run from per_page_num to per_page_num + 1; every chunk response
    is handed to parse_load.
    """
    base_url = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()
    first_page = int(self.per_page_num)
    for page_no in range(first_page, first_page + 2):
        GetWeibopage.data['page'] = page_no
        # Each profile page is served in three AJAX chunks; build each suffix
        # lazily so the helper calls keep their original interleaving.
        for build_suffix in (pager.get_firstloadurl,
                             pager.get_secondloadurl,
                             pager.get_thirdloadurl):
            yield Request(url=base_url + build_suffix(),
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': self.uid},
                          callback=self.parse_load)
def start_getweiboinfo(self, response):
    """Request the first/second/third lazy-load chunk of every profile page for self.uid.

    Issues WeiboSpider.page_num pages' worth of requests; each response is
    parsed by parse_load.  (A superseded Oracle keyword-lookup variant that
    had been left commented out was removed for readability.)
    """
    mainpageurl = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    getweibopage = GetWeibopage()
    for page in range(WeiboSpider.page_num):
        # GetWeibopage.data is shared class state read by the get_*loadurl helpers.
        GetWeibopage.data['page'] = page + 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
        yield Request(url=secondloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
        yield Request(url=thirdloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def parse_total_page(self, response):
    # Determine how many friend-circle pages exist for this user, then
    # schedule the per-page load requests (currently only page 1 — see TODO).
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body, 'script:contains("W_pages")')
    friendcircle_analyzer = keyword_info_analyzer()
    total_pages = friendcircle_analyzer.get_totalpages(total_pq)  # number of friend-circle pages to crawl
    logger.info("the total_pages is: %d", total_pages)
    getweibopage = GetWeibopage()
    mainpage_url = response.meta['mainpage_url']
    user_id = response.meta['uid']
    is_search = response.meta['is_search']
    for page in range(1):  # TODO: change this to total_pages
        GetWeibopage.data['uid'] = user_id
        GetWeibopage.data['page'] = page + 1
        firstload_url = mainpage_url + getweibopage.get_firstloadurl()
        yield Request(url=firstload_url,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'uid': user_id,
                          'is_search': is_search
                      },
                      callback=self.parse_load)
        # NOTE(review): the second/third requests are disabled, but the helper
        # calls are kept — they presumably advance shared GetWeibopage.data
        # state; confirm before deleting these lines.
        secondload_url = mainpage_url + getweibopage.get_secondloadurl()
        #yield Request(url=secondload_url,meta={'cookiejar':response.meta['cookiejar'],'uid':user_id,'is_search':is_search},callback=self.parse_load)
        thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
def start_getweiboinfo(self, response):
    """For up to 10 users whose content is still pending (contentstate = 0),
    schedule all three lazy-load chunk requests per profile page.

    Fix: the DB connection was never released; it is now closed after the
    crawl loop so repeated calls do not leak connections.
    """
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    try:
        for i in range(10):  # (count[0]): count[0] is the number of users still to crawl
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_firstload)
                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_secondload)
                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_thirdload)
    finally:
        conn.close()
def start_getweiboinfo(self, response):
    """Walk every row of t_user_info and emit the three chunk requests per
    profile page; yields None when the table is empty."""
    store = OracleStore()
    connection = store.get_connection()
    rows_cursor = store.select_operation(connection, '''select * from t_user_info''')
    count_cursor = store.select_operation(connection, '''select count(*) from t_user_info''')
    total = count_cursor.fetchone()
    if total[0]:
        for _ in range(total[0]):
            for row in rows_cursor.fetchmany(1):
                uid = row[0]
                base_url = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = uid
                pager = GetWeibopage()
                for page_index in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page_index + 1
                    # Each page arrives as three lazy-load chunks.
                    for build_suffix in (pager.get_firstloadurl,
                                         pager.get_secondloadurl,
                                         pager.get_thirdloadurl):
                        yield Request(url=base_url + build_suffix(),
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': uid},
                                      callback=self.parse_load)
    else:
        yield None
    store.close_connection(connection, rows_cursor, count_cursor)
def get_userinfo(self, response):
    """Request the third lazy-load chunk of one fixed account's first profile
    page and hand the response to get_userurl."""
    fixed_uid = 1227086635  # hard-coded sample account
    base_url = 'http://weibo.com/u/' + str(fixed_uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
    GetWeibopage.data['uid'] = fixed_uid
    GetWeibopage.data['page'] = 1
    pager = GetWeibopage()
    yield Request(url=base_url + pager.get_thirdloadurl(),
                  meta={'cookiejar': response.meta['cookiejar']},
                  callback=self.get_userurl)
def start_getweibo_info(self, response):
    # Crawl key-watch ("black list") persons in two passes per round:
    # first those never crawled (is_search=0), then — after an interval —
    # those already crawled (is_search=1), each within a time window.
    db = MysqlStore();  # select key persons not yet crawled and not deleted
    GetWeibopage.data['page'] = 1; getweibopage = GetWeibopage()
    for round in range(1):  # number of sweeps over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_black_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s", user_id)
            # mark the user as searched (is_search = 1)
            sql2 = "update cauc_black_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn, sql2)
            # build the search URL whose response tells us the total page count
            start_time = self.start_time; end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # pause between the two passes (e.g. a 15-minute interval)
        # NOTE(review): time.sleep blocks the whole Scrapy reactor — confirm intended.
        time.sleep(WeiboSpider.settings['FRIENDCIRCAL_INTERVAL'])
        # select key persons already crawled (is_search=1) and not deleted
        sql3 = "select user_id from cauc_black_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s", user_id)
            # window spans the last day (86400 s)
            start_time = get_time_by_interval(int(time.time()), 86400, 'hour'); end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            # second-pass request currently disabled:
            #yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
        conn.close()
def start_getweibo_info(self, response):
    # Crawl warning-list persons in two passes per round: first those never
    # crawled (is_search=0), then — after an interval — those already
    # crawled (is_search=1), each within a time window.
    db = MysqlStore();  # select warning persons not yet crawled and not deleted
    GetWeibopage.data['page'] = 1; getweibopage = GetWeibopage()
    for round in range(1):  # number of sweeps over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s", user_id)
            # mark the user as searched (is_search = 1)
            sql2 = "update cauc_warning_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn, sql2)
            # build the search URL whose response tells us the total page count
            start_time = self.start_time; end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # pause between the two passes (e.g. a 15-minute interval)
        # NOTE(review): time.sleep blocks the whole Scrapy reactor — confirm intended.
        time.sleep(WeiboSpider.settings['WEIBOCONTENT_INTERVAL'])
        # select warning persons already crawled (is_search=1) and not deleted
        sql3 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s", user_id)
            # window spans the last day (86400 s)
            start_time = get_time_by_interval(int(time.time()), 86400, 'hour'); end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id;
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            # second-pass request currently disabled:
            #yield Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
        conn.close()
def start_getweibo_info(self,response): db = MysqlStore(); #取出没有爬取过的且is_delete=0的重点人员 GetWeibopage.data['page'] = 1; getweibopage = GetWeibopage() #for round in range(1): #遍历数据库的轮数 conn = db.get_connection() sql1 = "select user_id from cauc_warning_man a \ where a.is_search = 0 and a.is_delete = 0" cursor1 = db.select_operation(conn,sql1) for user_id in cursor1.fetchall(): user_id = user_id[0] logger.info("this is the unsearched user_id:%s",user_id) #获取需要爬取的总页面数 start_time = self.start_time;end_time = get_current_time('day') mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&" GetWeibopage.data['uid'] = user_id; thirdload_url = mainpage_url + getweibopage.get_thirdloadurl() yield Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page) #取出已经爬取过is_search=1的且is_delete=0的预警人员 sql2 = "select user_id from cauc_warning_man a \ where a.is_search = 1 and a.is_delete = 0" cursor2 = db.select_operation(conn,sql2) for user_id in cursor2.fetchall(): user_id = user_id[0] logger.info("this is the searched user_id:%s",user_id) #start_time = get_time_by_interval(int(time.time()),86400,'day');end_time = get_current_time('day') #起始和结束间隔时间为1天(86400s),即过去一天的内容 start_time = get_time_by_interval(int(time.time()),int(self.interval),'day');end_time = get_current_time('day') #起始和结束间隔时间为x天(由interval代表的秒换算而来) mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&" GetWeibopage.data['uid'] = user_id; thirdload_url = mainpage_url + getweibopage.get_thirdloadurl() yield Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page) 
#更新is_search标志位为1 sql3 = "update cauc_warning_man set is_search = 1 where is_search = 0 and is_delete = 0" db.update_operation(conn,sql3) db.close_connection(conn)
def start_getweiboinfo(self, response):
    """Iterate t_user_info row by row and schedule the three lazy-load chunk
    requests for each of the first page_num profile pages per user; yields
    None when the table is empty."""
    oracle = OracleStore()
    conn = oracle.get_connection()
    user_cursor = oracle.select_operation(conn, '''select * from t_user_info''')
    total_cursor = oracle.select_operation(conn, '''select count(*) from t_user_info''')
    row_count = total_cursor.fetchone()
    if not row_count[0]:
        yield None
    else:
        for _ in range(row_count[0]):
            for record in user_cursor.fetchmany(1):
                user_uid = record[0]
                profile_url = ('http://weibo.com/u/' + str(user_uid) +
                               '?from=otherprofile&wvr=3.6&loc=tagweibo')
                GetWeibopage.data['uid'] = user_uid
                loader = GetWeibopage()
                for idx in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = idx + 1
                    # Three sequential AJAX chunks make up one page.
                    for suffix_of in (loader.get_firstloadurl,
                                      loader.get_secondloadurl,
                                      loader.get_thirdloadurl):
                        yield Request(url=profile_url + suffix_of(),
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': user_uid},
                                      callback=self.parse_load)
    oracle.close_connection(conn, user_cursor, total_cursor)
def start_getweiboinfo(self, response):
    """Schedule the three lazy-load chunk requests for each of the first
    page_num pages of self.uid's profile; parse_load handles every response."""
    profile_url = ('http://weibo.com/u/' + str(self.uid) +
                   '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&')
    GetWeibopage.data['uid'] = self.uid
    loader = GetWeibopage()
    for idx in range(WeiboSpider.page_num):
        GetWeibopage.data['page'] = idx + 1
        for suffix_of in (loader.get_firstloadurl,
                          loader.get_secondloadurl,
                          loader.get_thirdloadurl):
            yield Request(url=profile_url + suffix_of(),
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': self.uid},
                          callback=self.parse_load)
def parse_userinfo(self, response):
    """Attach the parsed user-info block to the item, then request the
    third lazy-load chunk of the second-to-last page for parse_thirdload."""
    item = response.meta['item']
    uid = response.meta['uid']
    analyzer = Analyzer()
    page_pq = analyzer.get_html(response.body, 'script:contains("PCD_text_b")')
    item['userinfo'] = analyzer.get_userinfo(page_pq)
    profile_url = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
    GetWeibopage.data['uid'] = uid
    GetWeibopage.data['page'] = WeiboSpider.page_num - 1
    loader = GetWeibopage()
    yield Request(url=profile_url + loader.get_thirdloadurl(),
                  meta={'cookiejar': response.meta['cookiejar'],
                        'item': item,
                        'uid': uid,
                        'followlist': response.meta['followlist']},
                  callback=self.parse_thirdload)
def parse_follow(self, response):
    # Parse the follow list of a user and, incrementally, schedule content
    # crawls for the first two followed uids.  A (flag, stop) pair persisted
    # via getinfo tracks the last uid crawled so repeated runs skip old data.
    #print '************************ source request url:',response.request.url
    item = WeibospiderItem()
    analyzer = Analyzer()
    total_pq = analyzer.get_followhtml(response.body)
    #item['followuidlist'] = analyzer.get_follow(total_pq)
    followlist = analyzer.get_follow(total_pq)
    #item['userinfo'] = {}
    oldflag, stopflag = getinfo.get_followflag(WeiboSpider.filename)
    p = re.compile('.*_page=(\d).*', re.S)
    current_page = p.search(response.request.url).group(1)  # page number of the current follow-list page
    if int(current_page) == 1:
        # on the first page, remember the newest followed uid (crawl not yet confirmed)
        getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'False')
        print 'page is equal 1 '
    else:
        print 'page is NOT equal 1'
    for follow_uid in followlist[:2]:
        print '%%%%%%%%%%%%%%%%%%%%%%%%%%', follow_uid
        #item['uid'] = follow_uid
        if follow_uid != oldflag:  # skip uids already crawled, i.e. incremental crawl
            # crawl the weibo content on this uid's profile page
            if stopflag == 'False':
                getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'True')
                mainpageurl = 'http://weibo.com/u/' + str(follow_uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = follow_uid
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    # first load of the page
                    # second load of the page
                    # third load of the page
                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    if int(GetWeibopage.data['pagebar']) == 1 and page == WeiboSpider.page_num - 1:
                        # on the last load of the last page, fetch the user's basic info
                        print 'hhhhhhhhhhhhhhhhhhhh', followlist
                        yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'item': item, 'uid': follow_uid, 'followlist': followlist}, callback=self.get_userurl)
                    #continue
                    #yield Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid},callback=self.parse_thirdload)
                    #firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    #yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'item':item,'uid':follow_uid},callback=self.parse_firstload)
            else:
                break
        else:
            break
def start_getweiboinfo(self, response):
    """For every user in t_user_follow still pending (contentstate = 0),
    schedule the three lazy-load chunk requests per profile page.

    Fixes: the original looped ``for i in (count[0]):`` — iterating a bare
    int, which raises TypeError at runtime; it now uses ``range(count[0])``
    (matching the sibling implementation).  The DB connection is also closed
    when the generator finishes.
    """
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    try:
        for i in range(count[0]):  # count[0] = number of users still to crawl
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_firstload)
                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_secondload)
                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]},
                                      callback=self.parse_thirdload)
    finally:
        conn.close()
def start_getweiboinfo(self, response):
    """Request the first/second/third lazy-load chunk of every profile page for self.uid.

    Issues WeiboSpider.page_num pages' worth of requests; each response is
    parsed by parse_load.  (A superseded Oracle keyword-lookup variant left
    commented out was removed for readability.)
    """
    mainpageurl = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    getweibopage = GetWeibopage()
    for page in range(WeiboSpider.page_num):
        # GetWeibopage.data is shared class state read by the get_*loadurl helpers.
        GetWeibopage.data['page'] = page + 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
        yield Request(url=secondloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
        yield Request(url=thirdloadurl,
                      meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid},
                      callback=self.parse_load)
def parse_total_page(self, response):
    # Determine how many friend-circle pages exist for this user, then
    # schedule the per-page load requests (currently only page 1 — see TODO).
    analyzer = Analyzer()
    total_pq = analyzer.get_html(response.body, 'script:contains("W_pages")')
    friendcircle_analyzer = keyword_info_analyzer()
    total_pages = friendcircle_analyzer.get_totalpages(total_pq)  # number of friend-circle pages to crawl
    logger.info("the total_pages is: %d", total_pages)
    getweibopage = GetWeibopage()
    mainpage_url = response.meta['mainpage_url']
    user_id = response.meta['uid']
    is_search = response.meta['is_search']
    for page in range(1):  # TODO: change this to total_pages
        GetWeibopage.data['uid'] = user_id
        GetWeibopage.data['page'] = page + 1
        firstload_url = mainpage_url + getweibopage.get_firstloadurl()
        yield Request(url=firstload_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id, 'is_search': is_search}, callback=self.parse_load)
        # NOTE(review): the second/third requests are disabled, but the helper
        # calls are kept — they presumably advance shared GetWeibopage.data
        # state; confirm before deleting these lines.
        secondload_url = mainpage_url + getweibopage.get_secondloadurl()
        #yield Request(url=secondload_url,meta={'cookiejar':response.meta['cookiejar'],'uid':user_id,'is_search':is_search},callback=self.parse_load)
        thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()