def run(self):
    while self.window.keepRunning:
        page = self.thread_id
        msg = u"Collecting weibo posts for user %s: page %s of %s." % (self.user.get("sn"), page, self.max_page)
        wx.CallAfter(self.window.WriteLog, msg)
        try:
            html = ""
            userid = self.user.get("_id")
            end_id = self.end_id
            eachpageCount = 0
            hasMore = 1
            max_id = ""
            crawler = self.msgcrawler
            # Each page is lazy-loaded in at most 3 chunks: iterations [0, 1, 2].
            while hasMore and eachpageCount <= 2:
                rnd = getMillitime()
                k_rnd = random.randint(10, 60)
                if eachpageCount == 0:
                    url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (
                        page, page - 1, end_id, rnd + str(k_rnd), userid, rnd)
                else:
                    # _k in the URL is a timestamp in milliseconds.
                    url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (
                        page, page, end_id, rnd + str(k_rnd + 1), max_id, eachpageCount - 1, userid, getMillitime())
                html = crawler.getAjaxmsg(url, "http://weibo.com/u/%s" % userid)
                html = crawler.getHtmlFromJson(html)
                hasMore, feedmsgLst, max_id = crawler.parseFeedlist(html)
                # Collect the parsed messages for returning.
                self.msgLst.extend(feedmsgLst)
                eachpageCount += 1
            self.result_queue.put(self.msgLst)
            self.window.finishedCount += 1
        except:
            s = sys.exc_info()
            msg = u"UsermsgJob.run Error %s happened on line %d" % (s[1], s[2].tb_lineno)
            logger.error(msg)
        finally:
            wx.CallAfter(self.window.UpdateCrawlProcess, self.window.finishedCount)
            break
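# The run() method above belongs to UsermsgJob, a workerpool.Job subclass that is
# constructed in parsePagelist() below. A minimal constructor sketch, assuming
# __init__ only stores the keyword arguments that run() relies on (hypothetical;
# the project's real __init__ is not shown in this section):
import workerpool

class UsermsgJob(workerpool.Job):
    def __init__(self, result_queue, thread_id, user, end_id, max_page, msg_crawler, window):
        self.result_queue = result_queue  # queue the collected messages are pushed to
        self.thread_id = thread_id        # doubles as the weibo page number to fetch
        self.user = user                  # user dict with "_id" (uid) and "sn" (screen name)
        self.end_id = end_id              # mid of the newest post, pins the paging window
        self.max_page = max_page          # total page count, used only for logging
        self.msgcrawler = msg_crawler     # crawler providing getAjaxmsg/getHtmlFromJson/parseFeedlist
        self.window = window              # wx window used for progress and log callbacks
        self.msgLst = []                  # messages collected by this job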
def parsePagelist(self, maxPage):
    url = ""
    userid = self.user.get("_id", "")
    max_id = ""
    end_id = ""
    html = ""
    msg = u"Collecting weibo posts for user %s: page %s of %s." % (self.user.get("sn"), 1, maxPage)
    wx.CallAfter(self.window.WriteLog, msg)
    try:
        eachpageCount = 0
        hasMore = 1
        # Fetch the first page of weibo posts.
        # Each page is lazy-loaded in at most 3 chunks: iterations [0, 1, 2].
        while hasMore and eachpageCount <= 2:
            rnd = getMillitime()
            k_rnd = random.randint(10, 60)
            page = 1
            if eachpageCount == 0:
                url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (
                    page, page - 1, end_id, rnd + str(k_rnd), userid, rnd)
            else:
                # _k in the URL is a timestamp in milliseconds.
                url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (
                    page, page, end_id, rnd + str(k_rnd + 1), max_id, eachpageCount - 1, userid, getMillitime())
            if not html:
                html = self.getAjaxmsg(url, "http://weibo.com/u/%s" % userid)
            html = self.getHtmlFromJson(html)
            hasMore, feedmsgLst, max_id = self.parseFeedlist(html)
            if eachpageCount == 0:
                # The newest mid on the first chunk pins the paging window (end_id) for later pages.
                end_id = feedmsgLst[0].get("mid", "0")
            # Collect the parsed messages for returning.
            self.msgLst.extend(feedmsgLst)
            eachpageCount += 1
            html = ""

        self.window.totalCount += maxPage * 3
        wx.CallAfter(self.window.SetCrawlProcessRange, (maxPage * 1 + self.window.processRangeVal))
        pool = workerpool.WorkerPool(self.thread_num)
        q = Queue(0)
        for i in range(2, maxPage + 1):
            try:
                # Start one paging-collection job per remaining page.
                job = UsermsgJob(result_queue=q, thread_id=i, user=self.user, end_id=end_id,
                                 max_page=maxPage, msg_crawler=self, window=self.window)
                pool.put(job)
            except:
                s = sys.exc_info()
                msg = u"jobThread ERROR %s happened on line %d" % (s[1], s[2].tb_lineno)
                logger.error(msg)
        pool.shutdown()
        pool.wait()
        try:
            # Drain the results collected by the paging jobs.
            for i in range(q.qsize()):
                self.msgLst.extend(q.get(block=False))
        except Empty:
            pass  # success
    except:
        s = sys.exc_info()
        msg = u"parsePagelist Error %s happened on line %d" % (s[1], s[2].tb_lineno)
        logger.error(msg)
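# Both methods above build the _k and __rnd query parameters from getMillitime(),
# which is defined elsewhere in the project. A minimal sketch, assuming it simply
# returns the current Unix time in milliseconds as a string (hypothetical
# implementation, not necessarily the project's own):
import time

def getMillitime():
    # e.g. "1389571200123"
    return str(int(time.time() * 1000))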