def get_userinfo(self, response):
    """Yield the first profile-page request for every pending user.

    Reads the rows of ``t_user_info`` whose ``imagestate`` is 0.  For each
    row whose first column is truthy, that value is used as the Weibo uid
    and one ``Request`` is yielded for the profile URL extended with
    ``GetWeibopage.get_firstloadurl()`` (page 1).  Each request carries the
    ``cookiejar`` taken from ``response.meta`` together with the uid, and
    is handled by ``self.get_userurl``.

    The connection and both cursors are released when the generator is
    exhausted, closed early, or fails.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_info where imagestate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_info where imagestate = 0"
    cursor2 = db.select_operation(conn, sql2)
    try:
        count = cursor2.fetchone()
        # Fetch the pending rows one at a time, at most count(*) of them.
        for _ in range(count[0]):
            for result in cursor1.fetchmany(1):
                uid = result[0]
                if not uid:
                    continue
                mainpageurl = ('http://weibo.com/u/' + str(uid) +
                               '?from=otherprofile&wvr=3.6&loc=tagweibo')
                # GetWeibopage reads its parameters from the class-level
                # ``data`` dict, so they are set just before the URL is built.
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': uid
                              },
                              callback=self.get_userurl)
    finally:
        # Previously the connection and cursors were never released.
        db.close_connection(conn, cursor1, cursor2)
def start_getweiboinfo(self, response):
    """Yield the load requests for two consecutive pages of one user.

    The pages are ``int(self.per_page_num)`` and the one after it, on the
    profile of ``self.uid``.  For each page the first, second and third
    load URLs produced by ``GetWeibopage`` are requested in that order.
    Every request carries the ``cookiejar`` from ``response.meta`` plus the
    uid and is handled by ``self.parse_load``.
    """
    profile_url = ('http://weibo.com/u/' + str(self.uid) +
                   '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&')
    # The uid is written to the shared class-level dict before the helper
    # instance is created, as in the original statement order.
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()
    url_builders = (pager.get_firstloadurl,
                    pager.get_secondloadurl,
                    pager.get_thirdloadurl)

    first_page = int(self.per_page_num)
    for page in range(first_page, first_page + 2):
        GetWeibopage.data['page'] = page
        for build_url in url_builders:
            # A fresh meta dict per request; requests must not share one.
            yield Request(url=profile_url + build_url(),
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)
def parse_total_page(self, response):
    """Log the total page count and request the first load of page 1.

    The page count is extracted from the ``script`` element containing
    ``W_pages`` in the response body and is only logged.  The profile URL,
    uid and search flag are read from ``response.meta`` (keys
    ``mainpage_url``, ``uid`` and ``is_search``).  One ``Request`` for the
    first load URL of page 1 is yielded, handled by ``self.parse_load``.
    """
    total_pq = Analyzer().get_html(response.body,
                                   'script:contains("W_pages")')
    # Number of Weibo friend-circle pages that need to be crawled.
    total_pages = keyword_info_analyzer().get_totalpages(total_pq)
    logger.info("the total_pages is: %d", total_pages)

    pager = GetWeibopage()
    base_url = response.meta['mainpage_url']
    uid = response.meta['uid']
    search_flag = response.meta['is_search']

    # TODO: iterate over all total_pages instead of only the first page.
    for page_number in range(1, 2):
        GetWeibopage.data['uid'] = uid
        GetWeibopage.data['page'] = page_number
        yield Request(url=base_url + pager.get_firstloadurl(),
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'uid': uid,
                          'is_search': search_flag
                      },
                      callback=self.parse_load)
        # The second and third load URLs are still built, but their
        # requests are disabled.  The calls are kept because the helper
        # methods may update GetWeibopage state -- TODO confirm.
        second_url = base_url + pager.get_secondloadurl()
        third_url = base_url + pager.get_thirdloadurl()
def start_getweiboinfo(self,response):
    """Yield the load requests for every page of the user ``self.uid``.

    Pages 1 through ``WeiboSpider.page_num`` are visited.  For each page the
    first, second and third load URLs produced by ``GetWeibopage`` are
    requested in that order.  Every request carries the ``cookiejar`` from
    ``response.meta`` plus the uid and is handled by ``self.parse_load``.
    """
    # An earlier revision read the uids from the "t_user_keyword" table,
    # filtered by self.keyword; that lookup is disabled and self.uid is used.
    profile_url = ('http://weibo.com/u/' + str(self.uid) +
                   '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&')
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()
    url_builders = (pager.get_firstloadurl,
                    pager.get_secondloadurl,
                    pager.get_thirdloadurl)

    for page_index in range(WeiboSpider.page_num):
        # Pages are numbered from 1.
        GetWeibopage.data['page'] = page_index + 1
        for build_url in url_builders:
            yield Request(url=profile_url + build_url(),
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': self.uid},
                          callback=self.parse_load)
def start_getweiboinfo(self,response):
    """Yield the load requests for up to ten followed users.

    Rows are read from ``t_user_follow`` where ``contentstate`` is 0, one
    row per iteration for at most ten iterations.  The second column of a
    row is used as the uid; rows where it is falsy are skipped.  For each
    page 1 through ``WeiboSpider.page_num`` three requests are yielded:
    the first, second and third load URLs, handled by
    ``self.parse_firstload``, ``self.parse_secondload`` and
    ``self.parse_thirdload`` respectively.
    """
    # NOTE(review): the connection and cursors are never released here;
    # the cleanup API of MysqlStore is not visible from this block.
    db = MysqlStore()
    conn = db.get_connection()
    pending_rows = db.select_operation(
        conn, "select * from t_user_follow where contentstate = 0")
    pending_count_cursor = db.select_operation(
        conn, "select count(*) from t_user_follow where contentstate = 0")
    # pending[0] is the number of users that still need to be crawled.
    # It is fetched but unused while the loop is capped at 10.
    pending = pending_count_cursor.fetchone()

    for _ in range(10):
        for row in pending_rows.fetchmany(1):
            uid = row[1]
            if not uid:
                continue
            profile_url = ('http://weibo.com/u/' + str(uid) +
                           '?from=otherprofile&wvr=3.6&loc=tagweibo')
            GetWeibopage.data['uid'] = uid
            pager = GetWeibopage()
            for page_index in range(WeiboSpider.page_num):
                GetWeibopage.data['page'] = page_index + 1

                first_url = profile_url + pager.get_firstloadurl()
                yield Request(url=first_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': uid},
                              callback=self.parse_firstload)

                second_url = profile_url + pager.get_secondloadurl()
                yield Request(url=second_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': uid},
                              callback=self.parse_secondload)

                third_url = profile_url + pager.get_thirdloadurl()
                yield Request(url=third_url,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'uid': uid},
                              callback=self.parse_thirdload)
def start_getweiboinfo(self,response):
    """Yield the load requests for every user stored in ``t_user_info``.

    The first column of each row is used as the Weibo uid.  For each user
    and each page 1 through ``WeiboSpider.page_num`` three requests are
    yielded: the first, second and third load URLs produced by
    ``GetWeibopage``.  Every request carries the ``cookiejar`` from
    ``response.meta`` plus the uid and is handled by ``self.parse_load``.

    When the table is empty a single ``None`` is yielded, as before.

    The connection and cursors are released in a ``finally`` clause, so
    they are freed even if the generator is closed early or fails.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = '''select * from t_user_info'''
    cursor1 = db.select_operation(conn,sql1)
    sql2 = '''select count(*) from t_user_info'''
    cursor2 = db.select_operation(conn,sql2)
    try:
        count = cursor2.fetchone()
        if not count[0]:
            # Kept for backward compatibility with the original output.
            yield None
            return

        # Fetch the rows one at a time, at most count(*) of them.
        for _ in range(count[0]):
            for result in cursor1.fetchmany(1):
                uid = result[0]
                mainpageurl = 'http://weibo.com/u/'+str(uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    # Pages are numbered from 1.
                    GetWeibopage.data['page'] = page+1

                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.parse_load)

                    secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                    yield Request(url=secondloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.parse_load)

                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    yield Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.parse_load)
    finally:
        # Previously this ran only if the generator finished normally.
        db.close_connection(conn,cursor1,cursor2)
def get_userinfo(self,response):
    """Yield the first profile-page request for each uid in ``self.uid_list``.

    For every uid a progress line is printed and one ``Request`` is yielded
    for the profile URL extended with ``GetWeibopage.get_firstloadurl()``
    (page 1).  Each request carries the ``cookiejar`` from
    ``response.meta`` plus the uid and is handled by ``self.get_userurl``.
    """
    # An earlier revision skipped uids already stored in t_user_info or
    # t_publicuser_info; that database check is currently disabled.
    for uid in self.uid_list:
        # One pre-formatted argument prints the same text with the Python 2
        # print statement and the Python 3 print function; the bare
        # statement form is a SyntaxError on Python 3.
        print("!!scraping each uid: %s" % (uid,))
        mainpageurl = 'http://weibo.com/u/'+str(uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
        # GetWeibopage reads its parameters from the class-level ``data``
        # dict, so they are set just before the URL is built.
        GetWeibopage.data['uid'] = uid
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.get_userurl)
def start_getweiboinfo(self, response):
    """Yield the load requests for every user stored in ``t_user_info``.

    The first column of each row is used as the Weibo uid.  For each user
    and each page 1 through ``WeiboSpider.page_num`` the first, second and
    third load URLs produced by ``GetWeibopage`` are requested in that
    order.  Every request carries the ``cookiejar`` from ``response.meta``
    plus the uid and is handled by ``self.parse_load``.

    When the table is empty a single ``None`` is yielded, as before.

    Cleanup runs in a ``finally`` clause, so the connection and cursors are
    released even if the generator is closed early or fails.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = '''select * from t_user_info'''
    cursor1 = db.select_operation(conn, sql1)
    sql2 = '''select count(*) from t_user_info'''
    cursor2 = db.select_operation(conn, sql2)
    try:
        count = cursor2.fetchone()
        if not count[0]:
            # Kept for backward compatibility with the original output.
            yield None
            return

        # Fetch the rows one at a time, at most count(*) of them.
        for _ in range(count[0]):
            for result in cursor1.fetchmany(1):
                uid = result[0]
                mainpageurl = ('http://weibo.com/u/' + str(uid) +
                               '?from=otherprofile&wvr=3.6&loc=tagweibo')
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                url_builders = (getweibopage.get_firstloadurl,
                                getweibopage.get_secondloadurl,
                                getweibopage.get_thirdloadurl)
                for page in range(WeiboSpider.page_num):
                    # Pages are numbered from 1.
                    GetWeibopage.data['page'] = page + 1
                    for build_url in url_builders:
                        # A fresh meta dict is built for every request.
                        yield Request(url=mainpageurl + build_url(),
                                      meta={
                                          'cookiejar':
                                          response.meta['cookiejar'],
                                          'uid': uid
                                      },
                                      callback=self.parse_load)
    finally:
        # Previously this ran only if the generator finished normally.
        db.close_connection(conn, cursor1, cursor2)
def start_getweiboinfo(self,response):
    """Yield the load requests for every page of the user ``self.uid``.

    Pages 1 through ``WeiboSpider.page_num`` are visited.  For each page
    three requests are yielded, for the first, second and third load URLs
    produced by ``GetWeibopage``.  All of them are handled by
    ``self.parse_load`` and carry the ``cookiejar`` from ``response.meta``
    plus the uid.
    """
    base_url = 'http://weibo.com/u/'+str(self.uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
    GetWeibopage.data['uid'] = self.uid
    pager = GetWeibopage()

    def load_request(url_suffix):
        # Build one request for the given load-URL suffix.
        return Request(url=base_url + url_suffix,
                       meta={'cookiejar':response.meta['cookiejar'],'uid':self.uid},
                       callback=self.parse_load)

    for page_index in range(WeiboSpider.page_num):
        # Pages are numbered from 1.
        GetWeibopage.data['page'] = page_index+1
        yield load_request(pager.get_firstloadurl())
        yield load_request(pager.get_secondloadurl())
        yield load_request(pager.get_thirdloadurl())
def start_getweiboinfo(self, response):
    """Yield the load requests for every followed user still pending.

    Rows are read from ``t_user_follow`` where ``contentstate`` is 0.  The
    second column of a row is used as the uid; rows where it is falsy are
    skipped.  For each user and each page 1 through
    ``WeiboSpider.page_num`` three requests are yielded: the first, second
    and third load URLs produced by ``GetWeibopage``, handled by
    ``self.parse_firstload``, ``self.parse_secondload`` and
    ``self.parse_thirdload`` respectively.  Every request carries the
    ``cookiejar`` from ``response.meta`` plus the uid.
    """
    # NOTE(review): the connection and cursors are never released here;
    # the cleanup API of MysqlStore is not visible from this block.
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()

    # count[0] is the number of users that still need to be crawled.  It is
    # a number, not an iterable: the original ``for i in (count[0])`` raised
    # TypeError, so it is wrapped in range().
    for _ in range(count[0]):
        for result in cursor1.fetchmany(1):
            uid = result[1]
            if not uid:
                continue
            mainpageurl = ('http://weibo.com/u/' + str(uid) +
                           '?from=otherprofile&wvr=3.6&loc=tagweibo')
            GetWeibopage.data['uid'] = uid
            getweibopage = GetWeibopage()
            # Each load stage has its own parser callback.
            stages = (
                (getweibopage.get_firstloadurl, self.parse_firstload),
                (getweibopage.get_secondloadurl, self.parse_secondload),
                (getweibopage.get_thirdloadurl, self.parse_thirdload),
            )
            for page in range(WeiboSpider.page_num):
                # Pages are numbered from 1.
                GetWeibopage.data['page'] = page + 1
                for build_url, parser in stages:
                    yield Request(url=mainpageurl + build_url(),
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': uid
                                  },
                                  callback=parser)
def get_userinfo(self, response):
    """Yield the first profile-page request for each uid in ``self.uid_list``.

    A progress line is printed for every uid, then one ``Request`` is
    yielded for the profile URL extended with
    ``GetWeibopage.get_firstloadurl()`` (page 1).  Each request carries the
    ``cookiejar`` from ``response.meta`` plus the uid and is handled by
    ``self.get_userurl``.
    """
    # The database check that skipped uids already present in t_user_info
    # or t_publicuser_info is disabled, so every listed uid is scraped.
    for uid in self.uid_list:
        # The bare ``print "..."`` statement is a SyntaxError on Python 3.
        # A single pre-formatted argument prints identical text on
        # Python 2 and Python 3.
        print("!!scraping each uid: %s" % (uid,))
        mainpageurl = ('http://weibo.com/u/' + str(uid) +
                       '?from=otherprofile&wvr=3.6&loc=tagweibo')
        # GetWeibopage reads its parameters from the class-level ``data``
        # dict, so they are set just before the URL is built.
        GetWeibopage.data['uid'] = uid
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = 1
        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        yield Request(url=firstloadurl,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'uid': uid
                      },
                      callback=self.get_userurl)
def start_getweiboinfo(self, response):
    """Yield the load requests for every page of the user ``self.uid``.

    Pages 1 through ``WeiboSpider.page_num`` are visited.  Per page, the
    first, second and third load URLs produced by ``GetWeibopage`` are
    requested in that order.  Each request is handled by
    ``self.parse_load`` and carries the ``cookiejar`` from
    ``response.meta`` plus the uid.
    """
    # The uids used to come from the "t_user_keyword" table (filtered by
    # self.keyword); that lookup is disabled and self.uid is used instead.
    user_page = ('http://weibo.com/u/' + str(self.uid) +
                 '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&')
    GetWeibopage.data['uid'] = self.uid
    page_helper = GetWeibopage()

    def request_for(url_suffix):
        # Build one parse_load request for the given load-URL suffix.
        return Request(url=user_page + url_suffix,
                       meta={
                           'cookiejar': response.meta['cookiejar'],
                           'uid': self.uid
                       },
                       callback=self.parse_load)

    for page_index in range(WeiboSpider.page_num):
        # Pages are numbered from 1.
        GetWeibopage.data['page'] = page_index + 1
        yield request_for(page_helper.get_firstloadurl())
        yield request_for(page_helper.get_secondloadurl())
        yield request_for(page_helper.get_thirdloadurl())
def get_userinfo(self,response):
    """Yield the first profile-page request for every pending user.

    Reads the rows of ``t_user_info`` whose ``imagestate`` is 0.  Each row
    with a truthy first column (used as the Weibo uid) produces one
    ``Request`` for the profile URL extended with
    ``GetWeibopage.get_firstloadurl()`` (page 1).  The request carries the
    ``cookiejar`` from ``response.meta`` plus the uid and is handled by
    ``self.get_userurl``.

    The connection and both cursors are released when the generator is
    exhausted, closed early, or fails.
    """
    db = OracleStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_info where imagestate = 0"
    cursor1 = db.select_operation(conn,sql1)
    sql2 = "select count(*) from t_user_info where imagestate = 0"
    cursor2 = db.select_operation(conn,sql2)
    try:
        count = cursor2.fetchone()
        # Fetch the pending rows one at a time, at most count(*) of them.
        for _ in range(count[0]):
            for result in cursor1.fetchmany(1):
                uid = result[0]
                if not uid:
                    continue
                mainpageurl = 'http://weibo.com/u/'+str(uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                # GetWeibopage reads its parameters from the class-level
                # ``data`` dict, so they are set just before the URL is built.
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':uid},callback=self.get_userurl)
    finally:
        # Previously the connection and cursors were never released.
        db.close_connection(conn,cursor1,cursor2)
def parse_total_page(self,response):
    """Log the total page count and request the first load of page 1.

    The count is extracted from the ``script`` element containing
    ``W_pages`` in the response body and is only logged; the loop is still
    limited to a single page.  ``mainpage_url``, ``uid`` and ``is_search``
    are read from ``response.meta``.  The one yielded ``Request`` targets
    the first load URL and is handled by ``self.parse_load``.
    """
    html_analyzer = Analyzer()
    pages_pq = html_analyzer.get_html(response.body,'script:contains("W_pages")')
    pages_analyzer = keyword_info_analyzer()
    # Number of Weibo friend-circle pages that need to be crawled.
    total_pages = pages_analyzer.get_totalpages(pages_pq)
    logger.info("the total_pages is: %d",total_pages)

    page_helper = GetWeibopage()
    profile_url = response.meta['mainpage_url']
    uid = response.meta['uid']
    search_flag = response.meta['is_search']

    # TODO: replace the fixed single iteration with total_pages.
    for page_index in range(1):
        GetWeibopage.data['uid'] = uid
        GetWeibopage.data['page'] = page_index + 1

        first_url = profile_url + page_helper.get_firstloadurl()
        yield Request(url=first_url,
                      meta={'cookiejar':response.meta['cookiejar'],'uid':uid,'is_search':search_flag},
                      callback=self.parse_load)

        # The requests for the second and third loads are disabled.  The
        # URLs are still built because the helper calls may update
        # GetWeibopage state -- TODO confirm before removing.
        second_url = profile_url + page_helper.get_secondloadurl()
        third_url = profile_url + page_helper.get_thirdloadurl()