def crawl_follows(self):
    """Crawl every follows page of self.uid and persist it via FileStorage.

    Verifies the user exists first, honours settings.PAGE_LIMIT, and
    fetches pages 2..N concurrently with a 5-thread WorkerManager.
    """
    def _crawl(parser, uid, page, num_pages=''):
        # Fetch and parse one follows page; returns the total page count
        # reported by the parser, or 0 when fetching/parsing fails.
        msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.com/%s/follow?page=%s' % (uid, page)
        html = self._fetch(url, query=settings.QUERY_FOLLOWS)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except Exception:  # was a bare except: keep best-effort, but stop hiding SystemExit/KeyboardInterrupt
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # network/login error while checking
        return
    if not is_exist:
        msg = 'Not exist: %s.' % (self.uid)
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
    start_time = time.time()
    parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    if settings.PAGE_LIMIT != 0:
        if num_pages > settings.PAGE_LIMIT:
            msg = ('For sina policy, reduce page count from %s to %s'
                   % (num_pages, settings.PAGE_LIMIT))
            write_message(msg, self.window)
            num_pages = settings.PAGE_LIMIT
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s follows: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def test(): import socket socket.setdefaulttimeout(10) print 'start testing' wm = WorkerManager(3) for i in range(1,11): wm.add_job( test_job, i, i*0.001 ) wm.wait_for_complete()
def crawl_weibos(self):
    """Crawl every weibo page of self.uid and persist it via FileStorage.

    Verifies the user exists first; pages 2..N are fetched concurrently
    with a 5-thread WorkerManager.
    """
    def _crawl(parser, uid, page, num_pages=""):
        # Fetch and parse one weibo page; returns the total page count
        # reported by the parser, or 0 when fetching/parsing fails.
        msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except Exception:  # was a bare except: keep best-effort, but stop hiding SystemExit/KeyboardInterrupt
            return 0

    msg = "Checking: whether user(%s) exists or not..." % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # network/login error while checking
        return
    if not is_exist:
        msg = "Not exist: %s." % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = "Crawl user(%s)'s weibos: total page=%s," " cost time=%s sec, connections=%s" % (
        self.uid,
        num_pages,
        cost_time,
        self.fetcher.n_connections,
    )
    logger.info(msg)
    write_message(msg, self.window)
def crawl_msg_reposts(self):
    """Crawl every repost page of the message at self.msg_url."""
    def _crawl(parser, msg_id, page, num_pages=""):
        # Fetch one reposts page; parse failures are deliberately ignored so
        # the page count is still propagated to the caller.
        msg = "Crawl message(%s)'s reposts-page:%s:%s" % (self.msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_repost(msg_id, page)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except Exception:  # was a bare except: keep best-effort, but stop hiding SystemExit/KeyboardInterrupt
            pass
        return num_pages

    msg = "Checking: whether message exists or not..."
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:
        msg = "Not exist: %s." % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
    start_time = time.time()
    parser = ComRepostsParser(msg_id, self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.msg_id, 1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = "Crawl message(%s)'s reposts: total page=%s," " cost time=%s sec, connections=%s" % (
        self.msg_id,
        num_pages,
        cost_time,
        self.fetcher.n_connections,
    )
    logger.info(msg)
    write_message(msg, self.window)
def crawl_weibos(self):
    """Crawl every weibo page of self.uid and persist it via FileStorage.

    Verifies the user exists first; pages 2..N are fetched concurrently
    with a 5-thread WorkerManager.
    """
    def _crawl(parser, uid, page, num_pages=''):
        # Fetch and parse one weibo page; returns the total page count
        # reported by the parser, or 0 when fetching/parsing fails.
        msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except Exception:  # was a bare except: keep best-effort, but stop hiding SystemExit/KeyboardInterrupt
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # network/login error while checking
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def crawl_msg_reposts(self):
    """Crawl every repost page of the message at self.msg_url."""
    def _crawl(parser, msg_id, page, num_pages=''):
        # Fetch one reposts page; parse failures are deliberately ignored so
        # the page count is still propagated to the caller.
        msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_repost(msg_id, page)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except Exception:  # was a bare except: keep best-effort, but stop hiding SystemExit/KeyboardInterrupt
            pass
        return num_pages

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
    start_time = time.time()
    parser = ComRepostsParser(msg_id, self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.msg_id, 1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s reposts: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def crawl_fans(self):
    """Crawl every fans page of self.uid via weibo.cn.

    Returns True on success, False when the user does not exist, and
    None when a network/parse error occurred (any partially written
    fans file is deleted in that case).
    """
    def _crawl(parser, uid, page, num_pages='?'):
        # Fetch and parse one fans page; None signals an error to the caller.
        msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
        html = self._fetch(url)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except Exception:  # was a bare except: keep error-as-None protocol
            return None

    def _delete_partial_result():
        # Best-effort cleanup of a half-written fans file after an error.
        try:
            self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
        except Exception:
            pass

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if not is_exist:
        msg = 'Not exist: %s.' % (self.uid)
        logger.info(msg)
        write_message(msg, self.window)
        return False
    self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)
    start_time = time.time()
    parser = CnFansParser(self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    if num_pages is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        _delete_partial_result()
        return None
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            _delete_partial_result()
            return None
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s fans: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
def crawl_msg_comments(self):
    """Crawl every comments page of the message at self.msg_url.

    Returns True on success, False when the message does not exist, and
    None when a network/parse error occurred (any partially written
    comments file is deleted in that case).
    """
    def _crawl(parser, msg_id, page, num_pages='?'):
        # Fetch and parse one comments page; None signals an error.
        msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_comment(msg_id, page)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            return num_pages
        except Exception:  # was a bare except: keep error-as-None protocol
            return None

    def _delete_partial_result():
        # Best-effort cleanup of a half-written comments file after an error.
        try:
            self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
        except Exception:
            pass

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if msg_id is False:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return False
    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)
    start_time = time.time()
    parser = ComCommentsParser(msg_id, self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.msg_id, 1)
    if num_pages is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        _delete_partial_result()
        return None
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            _delete_partial_result()
            return None
    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s comments: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
# encoding: utf-8 from thread_pool import WorkerManager # import sys import time def do_job(msg): # sys.stdout.write(msg) # print 'in do job:', msg return msg if __name__ == '__main__': st = time.time() wm = WorkerManager(5, 5) wm.add_job(do_job, None) for i in range(1, 100): wm.add_job(do_job, i) wm.wait_all_complete() res = wm.get_result() wm.stop() print 'res:', res ed = time.time() print 'cost time: %s' %(ed - st)
uids_storage=follows) sw.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=fans) friends_list = list(set(fans) | set(follows)) print friends_list #host's weibo sw.main(fetcher, fetch_data='weibos', store_path='./file/', uids=memstorage.users_id_moniterd) #friends' weibo n_threads = 10 n_paritions = 10 len_partition = len(friends_list) / n_paritions worker_manager = WorkerManager(n_threads) for i in range(0, len(friends_list), len_partition): worker_manager.add_job( sw.main, fetcher, fetch_data='weibos', store_path='./file/', uids=friends_list[i:min(i + len_partition, len(friends_list))]) worker_manager.wait_all_complete()
fetcher = ComWeiboFetcher(username=account.user, password=account.pwd) login_ok = fetcher.check_cookie() if not login_ok: print 'login failed.' sys.exit() fans = [] follows = [] sw.main(fetcher, fetch_data='follows', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=follows) sw.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=fans) friends_list = list(set(fans)|set(follows)) print friends_list #host's weibo sw.main(fetcher,fetch_data='weibos',store_path='./file/',uids=memstorage.users_id_moniterd) #friends' weibo n_threads = 10 n_paritions = 10 len_partition = len(friends_list)/n_paritions worker_manager = WorkerManager(n_threads) for i in range(0,len(friends_list),len_partition): worker_manager.add_job(sw.main, fetcher, fetch_data='weibos',store_path='./file/', uids=friends_list[i:min(i+len_partition,len(friends_list))] ) worker_manager.wait_all_complete()
def crawl_fans(self):
    """Crawl every fans page of self.uid via weibo.cn.

    Returns True on success, False when the user does not exist, and
    None when a network/parse error occurred (any partially written
    fans file is deleted in that case).
    """
    def _crawl(parser, uid, page, num_pages='?'):
        # Fetch and parse one fans page; None signals an error to the caller.
        msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
        html = self._fetch(url)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except Exception:  # was a bare except: keep error-as-None protocol
            return None

    def _delete_partial_result():
        # Best-effort cleanup of a half-written fans file after an error.
        try:
            self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
        except Exception:
            pass

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if not is_exist:
        msg = 'Not exist: %s.' % (self.uid)
        logger.info(msg)
        write_message(msg, self.window)
        return False
    self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)
    start_time = time.time()
    parser = CnFansParser(self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    if num_pages is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        _delete_partial_result()
        return None
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            _delete_partial_result()
            return None
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s fans: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
def crawl_msg_comments(self):
    """Crawl every comments page of the message at self.msg_url.

    Returns True on success, False when the message does not exist, and
    None when a network/parse error occurred (any partially written
    comments file is deleted in that case).
    """
    def _crawl(parser, msg_id, page, num_pages='?'):
        # Fetch and parse one comments page; None signals an error.
        msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
            msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_comment(msg_id, page)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            return num_pages
        except Exception:  # was a bare except: keep error-as-None protocol
            return None

    def _delete_partial_result():
        # Best-effort cleanup of a half-written comments file after an error.
        try:
            self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
        except Exception:
            pass

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if msg_id is False:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return False
    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)
    start_time = time.time()
    parser = ComCommentsParser(msg_id, self.storage)
    # Page 1 is fetched synchronously to learn the total page count.
    num_pages = _crawl(parser, self.msg_id, 1)
    if num_pages is None:  # error occur
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        _delete_partial_result()
        return None
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            _delete_partial_result()
            return None
    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s comments: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
# encoding: utf-8 from thread_pool import WorkerManager # import sys import time def do_job(msg): # sys.stdout.write(msg) # print 'in do job:', msg return msg if __name__ == '__main__': st = time.time() wm = WorkerManager(5, 5) wm.add_job(do_job, None) for i in range(1, 100): wm.add_job(do_job, i) wm.wait_all_complete() res = wm.get_result() wm.stop() print 'res:', res ed = time.time() print 'cost time: %s' % (ed - st)