Exemplo n.º 1
0
 def crawl_fans(self):
     """Crawl all fans pages of ``self.uid`` and feed them to the fans parser.

     Page 1 is fetched first; whatever the parser returns for it is used as
     the total page count, and pages 2..N are then queued on a
     ``WorkerManager`` pool.

     Returns:
         True on success, False when ``check_user`` reports that the user
         does not exist, None on any error (the storage's fans file is then
         deleted on a best-effort basis).
     """
     def _report(text):
         # Final status lines go to both the logger and ``self.window``.
         logger.info(text)
         write_message(text, self.window)

     def _abort():
         # Report the failure, then try to delete the stored fans file.
         _report('Error')
         try:
             self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
         except Exception:
             # Cleanup stays best-effort, but leave a trace in the log
             # instead of swallowing the failure silently.
             logger.info('Could not delete fans file of user(%s)' % self.uid)
         return None

     def _crawl(parser, uid, page, num_pages='?'):
         # Fetch and parse a single fans page; None signals failure.
         msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (uid, num_pages, page)
         write_message(msg, self.window)

         url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
         html = self._fetch(url)
         if html is None:
             return None

         try:
             return parser.parse(pq(html))
         except Exception:
             # Deliberately not a bare ``except``: KeyboardInterrupt and
             # SystemExit must still be able to stop the crawl.
             return None

     write_message('Checking: whether user(%s) exists or not...' % self.uid,
                   self.window)
     is_exist = self.fetcher.check_user(self.uid)

     if is_exist is None:    # the existence check itself failed
         _report('Error')
         return None

     if not is_exist:
         _report('Not exist: %s.' % (self.uid))
         return False

     self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)

     start_time = time.time()

     parser = CnFansParser(self.storage)

     # The first page is crawled synchronously because its parse result is
     # used as the number of pages to fetch.
     num_pages = _crawl(parser, self.uid, page=1)
     if num_pages is None:
         return _abort()

     pages = range(2, num_pages + 1)
     if pages:
         n_threads = 5
         worker_manager = WorkerManager(n_threads)
         try:
             for pg in pages:
                 worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

             worker_manager.wait_all_complete()
             is_None = worker_manager.get_result()
         finally:
             # Always shut the pool down, even if queueing/waiting raised.
             worker_manager.stop()

         # A truthy result is treated as "some page failed".
         if is_None:
             return _abort()

     cost_time = int(time.time() - start_time)

     _report('Crawl user(%s)\'s fans: total page=%s,'
             ' cost time=%s sec, connections=%s'
             % (self.uid, num_pages, cost_time, self.fetcher.n_connections))

     return True
Exemplo n.º 2
0
 def crawl_msg_comments(self):
     """Crawl all comment pages of the message at ``self.msg_url``.

     The message id is resolved through ``self.fetcher.check_message``.
     Page 1 is fetched first to learn the page count reported by
     ``self._fetch_msg_comment``; pages 2..N are then queued on a
     ``WorkerManager`` pool.

     Returns:
         True on success, False when ``check_message`` returns False
         (message does not exist), None on any error (the storage's
         comments file is then deleted on a best-effort basis).
     """
     def _report(text):
         # Final status lines go to both the logger and ``self.window``.
         logger.info(text)
         write_message(text, self.window)

     def _abort():
         # Report the failure, then try to delete the stored comments file.
         _report('Error')
         try:
             self.storage.delete(self.storage.comments_fp,
                                 self.storage.comments_f_name)
         except Exception:
             # Cleanup stays best-effort, but leave a trace in the log
             # instead of swallowing the failure silently.
             logger.info('Could not delete comments file of message(%s)'
                         % self.msg_id)
         return None

     def _crawl(parser, msg_id, page, num_pages='?'):
         # Fetch and parse one comments page; returns the page count that
         # came back with the page, or None on failure.
         msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (msg_id, num_pages, page)
         write_message(msg, self.window)

         html, num_pages = self._fetch_msg_comment(msg_id, page)
         if html is None:
             return None

         try:
             parser.parse(pq(html))
         except Exception:
             # Deliberately not a bare ``except``: KeyboardInterrupt and
             # SystemExit must still be able to stop the crawl.
             return None
         return num_pages

     write_message('Checking: whether message exists or not...', self.window)
     msg_id = self.fetcher.check_message(self.msg_url)

     if msg_id is None:      # the existence check itself failed
         _report('Error')
         return None

     if msg_id is False:
         _report('Not exist: %s.' % self.msg_url)
         return False

     self.msg_id = msg_id
     self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)

     start_time = time.time()

     parser = ComCommentsParser(msg_id, self.storage)

     # The first page is crawled synchronously because it yields the number
     # of pages to fetch.
     num_pages = _crawl(parser, self.msg_id, 1)
     if num_pages is None:
         return _abort()

     pages = range(2, num_pages + 1)
     if pages:
         n_threads = 5
         worker_manager = WorkerManager(n_threads)
         try:
             for pg in pages:
                 worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

             worker_manager.wait_all_complete()
             is_None = worker_manager.get_result()
         finally:
             # Always shut the pool down, even if queueing/waiting raised.
             worker_manager.stop()

         # A truthy result is treated as "some page failed".
         if is_None:
             return _abort()

     cost_time = int(time.time() - start_time)

     _report('Crawl message(%s)\'s comments: total page=%s,'
             ' cost time=%s sec, connections=%s'
             % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))

     return True
# encoding: utf-8

from thread_pool import WorkerManager
# import sys
import time

def do_job(msg):
    """Return ``msg`` unchanged; a no-op job body for exercising the pool."""
    return msg
    
    
if __name__ == '__main__':
    # Timing run: push 100 trivial jobs (None, then 1..99) through a
    # WorkerManager and print the collected result and the elapsed time.
    st = time.time()
    wm = WorkerManager(5, 5)

    try:
        wm.add_job(do_job, None)

        for i in range(1, 100):
            wm.add_job(do_job, i)

        wm.wait_all_complete()
        res = wm.get_result()
    finally:
        # Shut the pool down even when queueing or waiting fails.
        wm.stop()

    # Single-argument print(...) produces the same output on Python 2 and
    # is also valid syntax on Python 3.
    print('res: %s' % (res,))
    ed = time.time()

    print('cost time: %s' % (ed - st))
Exemplo n.º 4
0
    def crawl_fans(self):
        """Crawl all fans pages of ``self.uid`` and feed them to the parser.

        Page 1 is fetched first; whatever the parser returns for it is used
        as the total page count, and pages 2..N are then queued on a
        ``WorkerManager`` pool.

        Returns:
            True on success, False when ``check_user`` reports that the user
            does not exist, None on any error (the storage's fans file is
            then deleted on a best-effort basis).
        """
        def _report(text):
            # Final status lines go to both the logger and ``self.window``.
            logger.info(text)
            write_message(text, self.window)

        def _abort():
            # Report the failure, then try to delete the stored fans file.
            _report('Error')
            try:
                self.storage.delete(self.storage.fans_fp,
                                    self.storage.fans_f_name)
            except Exception:
                # Cleanup stays best-effort, but leave a trace in the log
                # instead of swallowing the failure silently.
                logger.info('Could not delete fans file of user(%s)' %
                            self.uid)
            return None

        def _crawl(parser, uid, page, num_pages='?'):
            # Fetch and parse a single fans page; None signals failure.
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)
            if html is None:
                return None

            try:
                return parser.parse(pq(html))
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must still be able to stop the crawl.
                return None

        write_message(
            'Checking: whether user(%s) exists or not...' % self.uid,
            self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  # the existence check itself failed
            _report('Error')
            return None

        if not is_exist:
            _report('Not exist: %s.' % (self.uid))
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN,
                                   self.store_path)

        start_time = time.time()

        parser = CnFansParser(self.storage)

        # The first page is crawled synchronously because its parse result
        # is used as the number of pages to fetch.
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:
            return _abort()

        pages = range(2, num_pages + 1)
        if pages:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            try:
                for pg in pages:
                    worker_manager.add_job(_crawl, parser, self.uid, pg,
                                           num_pages)

                worker_manager.wait_all_complete()
                is_None = worker_manager.get_result()
            finally:
                # Always shut the pool down, even if queueing/waiting raised.
                worker_manager.stop()

            # A truthy result is treated as "some page failed".
            if is_None:
                return _abort()

        cost_time = int(time.time() - start_time)

        _report('Crawl user(%s)\'s fans: total page=%s,'
                ' cost time=%s sec, connections=%s' %
                (self.uid, num_pages, cost_time, self.fetcher.n_connections))

        return True
Exemplo n.º 5
0
    def crawl_msg_comments(self):
        """Crawl all comment pages of the message at ``self.msg_url``.

        The message id is resolved through ``self.fetcher.check_message``.
        Page 1 is fetched first to learn the page count reported by
        ``self._fetch_msg_comment``; pages 2..N are then queued on a
        ``WorkerManager`` pool.

        Returns:
            True on success, False when ``check_message`` returns False
            (message does not exist), None on any error (the storage's
            comments file is then deleted on a best-effort basis).
        """
        def _report(text):
            # Final status lines go to both the logger and ``self.window``.
            logger.info(text)
            write_message(text, self.window)

        def _abort():
            # Report the failure, then try to delete the stored comments
            # file.
            _report('Error')
            try:
                self.storage.delete(self.storage.comments_fp,
                                    self.storage.comments_f_name)
            except Exception:
                # Cleanup stays best-effort, but leave a trace in the log
                # instead of swallowing the failure silently.
                logger.info('Could not delete comments file of message(%s)' %
                            self.msg_id)
            return None

        def _crawl(parser, msg_id, page, num_pages='?'):
            # Fetch and parse one comments page; returns the page count that
            # came back with the page, or None on failure.
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
                msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)
            if html is None:
                return None

            try:
                parser.parse(pq(html))
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must still be able to stop the crawl.
                return None
            return num_pages

        write_message('Checking: whether message exists or not...',
                      self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:  # the existence check itself failed
            _report('Error')
            return None

        if msg_id is False:
            _report('Not exist: %s.' % self.msg_url)
            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT,
                                   self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)

        # The first page is crawled synchronously because it yields the
        # number of pages to fetch.
        num_pages = _crawl(parser, self.msg_id, 1)
        if num_pages is None:
            return _abort()

        pages = range(2, num_pages + 1)
        if pages:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            try:
                for pg in pages:
                    worker_manager.add_job(_crawl, parser, self.msg_id, pg,
                                           num_pages)

                worker_manager.wait_all_complete()
                is_None = worker_manager.get_result()
            finally:
                # Always shut the pool down, even if queueing/waiting raised.
                worker_manager.stop()

            # A truthy result is treated as "some page failed".
            if is_None:
                return _abort()

        cost_time = int(time.time() - start_time)

        _report('Crawl message(%s)\'s comments: total page=%s,'
                ' cost time=%s sec, connections=%s' %
                (self.msg_id, num_pages, cost_time,
                 self.fetcher.n_connections))

        return True
# encoding: utf-8

from thread_pool import WorkerManager
# import sys
import time


def do_job(msg):
    """Hand ``msg`` straight back; a placeholder job for the worker pool."""
    return msg


if __name__ == '__main__':
    # Timing run: push 100 trivial jobs (None, then 1..99) through a
    # WorkerManager and print the collected result and the elapsed time.
    st = time.time()
    wm = WorkerManager(5, 5)

    try:
        wm.add_job(do_job, None)

        for i in range(1, 100):
            wm.add_job(do_job, i)

        wm.wait_all_complete()
        res = wm.get_result()
    finally:
        # Shut the pool down even when queueing or waiting fails.
        wm.stop()

    # Single-argument print(...) produces the same output on Python 2 and
    # is also valid syntax on Python 3.
    print('res: %s' % (res,))
    ed = time.time()

    print('cost time: %s' % (ed - st))