def run(self):
        """Worker-thread entry point: crawl one result page of a user's weibo feed.

        ``self.thread_id`` doubles as the page number.  Up to three lazy-load
        chunks are fetched for that page through the shared message crawler;
        the parsed messages accumulate in ``self.msgLst`` and are published on
        ``self.result_queue``.  ``self.window.finishedCount`` is bumped on
        success and the progress bar is refreshed either way.

        NOTE(review): the ``break`` in the ``finally`` clause always fires, so
        the outer ``while`` executes at most one iteration; it only serves as
        an initial ``self.window.keepRunning`` check.
        """
        while self.window.keepRunning:
            page = self.thread_id
            msg = u"正在采集用户:%s 第  %s 页 / 共 %s 页 微博." % (self.user.get("sn"),
                                                       page, self.max_page)
            wx.CallAfter(self.window.WriteLog, msg)
            try:
                html = ""
                userid = self.user.get("_id")

                end_id = self.end_id
                eachpageCount = 0
                hasMore = 1
                max_id = ""
                crawler = self.msgcrawler
                # Each page lazy-loads at most 3 chunks (pagebar index 0-2).
                while hasMore and eachpageCount <= 2:
                    rnd = getMillitime()
                    k_rnd = random.randint(10, 60)
                    if eachpageCount == 0:
                        # First chunk of the page (no pagebar/max_id yet).
                        url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (
                            page,
                            (page - 1), end_id, rnd + str(k_rnd), userid, rnd)
                    else:
                        # Subsequent lazy-load chunk; _k is a millisecond
                        # timestamp with a small random suffix.
                        url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (
                            page, (page), end_id, rnd + str(k_rnd + 1), max_id,
                            eachpageCount - 1, userid, getMillitime())
                    html = crawler.getAjaxmsg(url,
                                              "http://weibo.com/u/%s" % userid)
                    html = crawler.getHtmlFromJson(html)
                    hasMore, feedmsgLst, max_id = crawler.parseFeedlist(html)
                    # Accumulate parsed messages for this page.
                    self.msgLst.extend(feedmsgLst)
                    eachpageCount += 1
                self.result_queue.put(self.msgLst)
                self.window.finishedCount += 1
            except Exception:
                # Fix: was a bare except with a misleading "parsePagelist"
                # label -- this is the paging job's run() method.
                s = sys.exc_info()
                msg = (u"UsermsgJob.run Error %s happened on line %d" %
                       (s[1], s[2].tb_lineno))
                logger.error(msg)
            finally:
                # Always refresh the progress UI, then leave the loop.
                wx.CallAfter(self.window.UpdateCrawlProcess,
                             self.window.finishedCount)
                break
# ----- Example #2 -----
 def parsePagelist(self, maxPage):
     """Crawl page 1 of the user's weibo feed, then fan out worker jobs.

     Fetches the first result page chunk-by-chunk (up to three lazy-load
     chunks), remembers the newest message id as ``end_id``, then queues one
     ``UsermsgJob`` per remaining page (2..maxPage) on a worker pool and
     drains their results into ``self.msgLst``.

     :param maxPage: total number of result pages to crawl.
     """
     url = ""
     userid = self.user.get("_id", "")
     max_id = ""
     end_id = ""
     html = ""
     msg = u"正在采集用户:%s 第  %s 页 / 共 %s 页 微博." % (self.user.get("sn"), 1, maxPage)
     wx.CallAfter(self.window.WriteLog, msg)
     try:
         eachpageCount = 0
         hasMore = 1
         # Extract the first result page; each page lazy-loads at most
         # 3 chunks (pagebar index 0-2).
         while hasMore and eachpageCount <= 2:
             rnd = getMillitime()
             k_rnd = random.randint(10, 60)
             page = 1
             if eachpageCount == 0:
                 # First chunk of the page (no pagebar/max_id yet).
                 url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (page, (page-1), end_id, rnd+str(k_rnd), userid, rnd)
             else:
                 # _k in the URL is a millisecond timestamp.
                 url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (page, (page), end_id, rnd+str(k_rnd+1), max_id, eachpageCount-1, userid,  getMillitime())
             # html is reset to "" after each pass, so every chunk fetches.
             if not html:
                 html = self.getAjaxmsg(url, "http://weibo.com/u/%s" % userid)
                 html = self.getHtmlFromJson(html)
             hasMore, feedmsgLst, max_id = self.parseFeedlist(html)
             # Fix: guard against an empty first chunk (previously raised
             # IndexError); end_id keeps its "" default in that case.
             if eachpageCount == 0 and feedmsgLst:
                 end_id = feedmsgLst[0].get("mid", "0")
             # Append parsed messages to the result list.
             self.msgLst.extend(feedmsgLst)
             eachpageCount += 1
             html = ""

         self.window.totalCount += maxPage * 3
         wx.CallAfter(self.window.SetCrawlProcessRange,
                      (maxPage * 1 + self.window.processRangeVal))
         pool = workerpool.WorkerPool(self.thread_num)
         q = Queue(0)
         for i in range(2, maxPage + 1):
             try:
                 # Spawn one paging-crawl worker job per remaining page.
                 job = UsermsgJob(result_queue=q, thread_id=i, user=self.user,
                                  end_id=end_id, max_page=maxPage,
                                  msg_crawler=self, window=self.window)
                 pool.put(job)
             except Exception:
                 # Fix: was a bare except.
                 s = sys.exc_info()
                 msg = (u"jobThread ERROR %s happened on line %d" %
                        (s[1], s[2].tb_lineno))
                 logger.error(msg)
         pool.shutdown()
         pool.wait()
         try:
             # qsize() is approximate; tolerate the queue running dry early.
             for i in range(q.qsize()):
                 self.msgLst.extend(q.get(block=False))
         except Empty:
             pass  # drained successfully
     except Exception:
         # Fix: was a bare except.
         s = sys.exc_info()
         msg = (u"parsePagelist Error %s happened on line %d" %
                (s[1], s[2].tb_lineno))
         logger.error(msg)
# ----- Example #3 -----
 def run(self):
     """Worker-thread entry point: crawl one feed page for one user.

     ``self.thread_id`` doubles as the page number.  Parsed messages are
     accumulated into ``self.msgLst`` and pushed onto ``self.result_queue``.

     NOTE(review): the ``break`` in the ``finally`` clause always fires, so
     the outer ``while`` runs at most one iteration; it only serves as an
     initial ``keepRunning`` check.
     """
     while self.window.keepRunning :
         page = self.thread_id
         msg = u"正在采集用户:%s 第  %s 页 / 共 %s 页 微博." % (self.user.get("sn"), page, self.max_page)
         wx.CallAfter(self.window.WriteLog, msg)
         try:
             html = ""
             userid = self.user.get("_id")

             end_id = self.end_id
             eachpageCount = 0
             hasMore = 1
             max_id = ""
             crawler = self.msgcrawler
             # Each page lazy-loads at most 3 chunks (pagebar index 0-2).
             while hasMore and eachpageCount <= 2:
                 rnd = getMillitime()
                 k_rnd = random.randint(10, 60)
                 if eachpageCount == 0:
                     # First chunk of the page (no pagebar/max_id yet).
                     url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (page, (page-1), end_id, rnd+str(k_rnd), userid, rnd)
                 else:
                     # _k in the URL is a timestamp (presumably milliseconds
                     # -- the original comment said "millimetres").
                     url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (page, (page), end_id, rnd+str(k_rnd+1), max_id, eachpageCount-1, userid,  getMillitime())
                 html = crawler.getAjaxmsg(url, "http://weibo.com/u/%s" % userid)
                 html = crawler.getHtmlFromJson(html)
                 hasMore,feedmsgLst,max_id = crawler.parseFeedlist(html)
                 # Append parsed messages to the result list.
                 self.msgLst.extend(feedmsgLst)
                 eachpageCount += 1
             self.result_queue.put(self.msgLst)
             self.window.finishedCount += 1
         except:
             # NOTE(review): bare except, and the "parsePagelist" label is
             # misleading -- this is the job thread's run() method.
             s=sys.exc_info()
             msg = (u"parsePagelist Error %s happened on line %d" % (s[1],s[2].tb_lineno))
             logger.error(msg)
         finally:
             # Always refresh the progress UI, then leave the loop.
             wx.CallAfter(self.window.UpdateCrawlProcess, self.window.finishedCount)
             break
    def parsePagelist(self, maxPage):
        """Crawl page 1 of the user's weibo feed, then fan out worker jobs.

        Fetches the first result page chunk-by-chunk (up to three lazy-load
        chunks), remembers the newest message id as ``end_id``, then queues
        one ``UsermsgJob`` per remaining page (2..maxPage) on a worker pool
        and drains their results into ``self.msgLst``.

        :param maxPage: total number of result pages to crawl.
        """
        url = ""
        userid = self.user.get("_id", "")
        max_id = ""
        end_id = ""
        html = ""
        msg = u"正在采集用户:%s 第  %s 页 / 共 %s 页 微博." % (self.user.get("sn"), 1,
                                                   maxPage)
        wx.CallAfter(self.window.WriteLog, msg)
        try:
            eachpageCount = 0
            hasMore = 1
            # Extract the first result page.
            # Each page lazy-loads at most 3 chunks (pagebar index 0-2).
            while hasMore and eachpageCount <= 2:
                rnd = getMillitime()
                k_rnd = random.randint(10, 60)
                page = 1
                if eachpageCount == 0:
                    # First chunk of the page (no pagebar/max_id yet).
                    url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=50&pre_page=%s&end_id=%s&_k=%s&_t=0&end_msign=-1&uid=%s&__rnd=%s" % (
                        page,
                        (page - 1), end_id, rnd + str(k_rnd), userid, rnd)
                else:
                    # _k in the URL is a millisecond timestamp.
                    url = "http://weibo.com/aj/mblog/mbloglist?_wv=5&page=%s&count=15&pre_page=%s&end_id=%s&_k=%s&_t=0&max_id=%s&pagebar=%s&uid=%s&__rnd=%s" % (
                        page, (page), end_id, rnd + str(k_rnd + 1), max_id,
                        eachpageCount - 1, userid, getMillitime())
                # html is reset to "" at the end of each pass, so every chunk
                # triggers a fresh fetch here.
                if not html:
                    html = self.getAjaxmsg(url,
                                           "http://weibo.com/u/%s" % userid)
                    html = self.getHtmlFromJson(html)
                hasMore, feedmsgLst, max_id = self.parseFeedlist(html)
                if eachpageCount == 0:
                    # NOTE(review): raises IndexError if the first chunk is
                    # empty -- consider guarding on feedmsgLst.
                    end_id = feedmsgLst[0].get("mid", "0")
                # Append parsed messages to the result list.
                self.msgLst.extend(feedmsgLst)
                eachpageCount += 1
                html = ""

            self.window.totalCount += maxPage * 3
            wx.CallAfter(self.window.SetCrawlProcessRange,
                         (maxPage * 1 + self.window.processRangeVal))
            pool = workerpool.WorkerPool(self.thread_num)
            q = Queue(0)
            for i in range(2, maxPage + 1):
                try:
                    # Spawn one paging-crawl worker job per remaining page.
                    job = UsermsgJob(result_queue=q,
                                     thread_id=i,
                                     user=self.user,
                                     end_id=end_id,
                                     max_page=maxPage,
                                     msg_crawler=self,
                                     window=self.window)
                    pool.put(job)
                except:
                    # NOTE(review): bare except -- narrow to Exception.
                    s = sys.exc_info()
                    msg = (u"jobThread ERROR %s happened on line %d" %
                           (s[1], s[2].tb_lineno))
                    logger.error(msg)
            pool.shutdown()
            pool.wait()
            try:
                # qsize() is approximate; tolerate the queue running dry.
                for i in range(q.qsize()):
                    self.msgLst.extend(q.get(block=False))
            except Empty:
                pass  # Success
        except:
            # NOTE(review): bare except -- narrow to Exception.
            s = sys.exc_info()
            msg = (u"parsePagelist Error %s happened on line %d" %
                   (s[1], s[2].tb_lineno))
            logger.error(msg)