def test_celery():
    """Send a four-item payload through a ``Worker`` in ``'celery'`` mode.

    Returns whatever ``Worker.work`` returns for the payload and the
    settings assembled below.
    """
    payload = ['1', '2', '3', '4']
    settings = dict(
        celery_worker='test.functional.test_celery_group_little.simple',
        worker='test.functional.test_celery_group_little.worker_do_sth',
        celery_max_workers=1,
        celery_chunk_size=2,
        chunk_size=2,
        queue='a1',
        dummy='test.functional.test_celery.dummy',
    )
    return Worker(mode='celery').work(payload, settings)
def test(self, data, info):
    """Re-dispatch *data* to a thread-mode ``Worker`` using the sub-settings.

    A deep copy of *info* is made, its ``'chunk_size'`` and ``'worker'``
    entries are replaced by ``info['sub_chunk_size']`` and
    ``info['sub_worker']``, and the copy is passed to ``Worker.work``.
    The result of that call is not returned.
    """
    for shown in (data, info):
        print(shown, '1111')

    nested_info = deepcopy(info)
    nested_info['chunk_size'] = info['sub_chunk_size']
    nested_info['worker'] = info['sub_worker']

    # NOTE(review): the response is discarded here; confirm whether the
    # caller expects it to be returned.
    Worker(mode='thread').work(data, nested_info)
def xtest_it():
    """Run a 40-item payload through a coroutine-mode ``Worker`` and print the result."""
    print('>>>>')

    one_pass = [
        'u11', 'u22', 'u33', 'u44',
        'u21', 'u22', 'u23', 'u24',
        'u31', 'u32', 'u33', 'u34',
        'u41', 'u42', 'u43', 'u44',
        'u51', 'u52', 'u53', 'u54',
    ]
    payload = one_pass * 2  # the same 20 entries, twice over

    settings = dict(
        worker='test.functional.test_worker_coroutine.worker_do_sth',
        chunk_size=4,
    )
    print(Worker(mode='coroutine').work(payload, settings))
def xtest_celery():
    """Send a 20-item payload through a ``Worker`` in ``'celery'`` mode.

    Returns whatever ``Worker.work`` returns for the payload and the
    settings assembled below.
    """
    payload = [
        'u11', 'u22', 'u33', 'u44',
        'u21', 'u22', 'u23', 'u24',
        'u31', 'u32', 'u33', 'u34',
        'u41', 'u42', 'u43', 'u44',
        'u51', 'u52', 'u53', 'u54',
    ]
    # Not set for this run: 'final_callback' and 'queue'.
    settings = dict(
        celery_worker='test.functional.test_celery.simple',
        worker='test.functional.test_celery.worker_do_sth',
        celery_max_workers=1,
        celery_chunk_size=40,
        chunk_size=40,
        dummy='test.functional.test_celery.dummy',
        sync_callback='test.functional.test_celery.final_callback',
        each_callback='test.functional.test_celery.callback',
    )
    return Worker(mode='celery').work(payload, settings)
'netboy.celery.tasks.pycurl_worker', 'worker': 'netboy.celery.tasks.multicurl_worker_do_crawl', 'celery_max_workers': 4, 'celery_chunk_size': 10, 'chunk_size': 5, 'queue': 'worker', 'dummy': 'netboy.celery.tasks.dummy', 'filter': ['url', 'cookielist'], 'triggers': [ { 'hello': 'world' }, { 'hello2': 'world2' }, { 'trigger': 'netboy.support.trigger.trig_it' }, ], 'analysers': ['netboy.support.analysers.analyse_it'] } worker = Worker(mode='celery') resp = worker.work(data, info) print(resp)
def work_do_sth(data, info):
    """Call ``get_picture`` once for every element of *data*, in order.

    *info* is accepted but not used by this function.
    """
    for item in data:
        get_picture(item)


if __name__ == "__main__":
    # Timestamp taken before the work starts; nothing below reads it yet.
    started_at = time.time()

    pages = list(range(1, 7))
    settings = {'worker': 'pachong.work_do_sth', 'chunk_size': 2}

    thread_worker = Worker(mode="thread")
    response = thread_worker.work(pages, settings)

    # Alternatives that were tried here and left disabled: crawling directly
    # with a plain loop over get_picture, and starting one threading.Thread
    # per item.
class NetBoy:
    """Chainable builder for the ``info`` settings dict handed to a worker.

    Every ``use_*`` method writes one or more keys into ``self.info`` and
    returns ``self`` so calls can be chained. ``run`` passes the settings to
    a ``Worker``; ``run_remote`` and ``register_remote`` wrap them in a POST
    payload for ``curl_work``.
    """

    # Settings written by ``use_spider`` when the mode is 'celery'.
    _CELERY_SETTINGS = {
        'pycurl': {
            'celery_worker': 'netboy.celery.tasks.pycurl_worker',
            'worker': 'netboy.celery.tasks.multicurl_worker_do_crawl',
            'final_callback': 'netboy.celery.tasks.final_callback',
        },
        'chrome': {
            'celery_worker': 'netboy.celery.tasks.thread_worker',
            'worker': 'netboy.celery.tasks.chrome_worker_do_crawl',
            'final_callback': 'netboy.celery.tasks.final_callback',
        },
    }

    # Worker paths written by ``use_spider`` for every other mode.
    # NOTE(review): the coroutine branch below uses
    # '...aiohttp_handler.aiohttp_handler' while this table uses
    # '...aiohttp_handler.aio_http_handler' - confirm which name exists.
    _DEFAULT_WORKERS = {
        'pycurl': 'netboy.multi_pycurl.multicurl_handler.curl_handler',
        'chrome': 'netboy.selenium_chrome.chrome_driver_handler.chrome_driver_handler',
        'aiohttp': 'netboy.aio_http.aiohttp_handler.aio_http_handler',
    }

    def __init__(self, info=None):
        """Start from a copy of *info* (or an empty dict) plus default keys."""
        # Shallow-copy so the defaults below and later use_* calls do not
        # leak into a dict the caller may be sharing elsewhere.
        self.info = dict(info) if info else {}
        self.info['dummy'] = 'netboy.celery.tasks.dummy'
        self.info['log'] = 'netboy'

    def _use_proxy(self, proxy_type, proxy):
        """Store *proxy* (``'host:port'``) under the proxy keys as *proxy_type*."""
        parts = proxy.split(':')
        self.info['proxytype'] = proxy_type
        self.info['proxy'] = parts[0]
        self.info['proxyport'] = int(parts[1])
        return self

    def use_socks5_proxy(self, proxy):
        """Configure a SOCKS5 proxy given as ``'host:port'``."""
        return self._use_proxy('socks5', proxy)

    def use_http_proxy(self, proxy):
        """Configure an HTTP proxy given as ``'host:port'``."""
        return self._use_proxy('http', proxy)

    def use_queue(self, queue):
        """Set the ``'queue'`` entry."""
        self.info['queue'] = queue
        return self

    def use_logger(self, logger):
        """Set the ``'log'`` entry to the logger name *logger*."""
        self.info['log'] = logger
        return self

    def use_filter(self, result_filter):
        """Set the ``'filter'`` entry."""
        self.info['filter'] = result_filter
        return self

    def use_prepares(self, prepares):
        """Set the ``'prepares'`` entry."""
        self.info['prepares'] = prepares
        return self

    def use_triggers(self, triggers):
        """Set the ``'triggers'`` entry."""
        self.info['triggers'] = triggers
        return self

    def use_analysers(self, analysers):
        """Set the ``'analysers'`` entry."""
        self.info['analysers'] = analysers
        return self

    def use_auth(self, user, password, group='default'):
        """Store the credentials as a dict under ``'auth'``."""
        self.info['auth'] = {
            'user': user,
            'password': password,
            'group': group,
        }
        return self

    def use_useragent(self, useragent):
        """Set the ``'useragent'`` entry."""
        self.info['useragent'] = useragent
        return self

    def use_timeout(self, timeout=None, connect=None, wait=None, script=None):
        """Set whichever timeout entries were given a truthy value.

        Arguments left as ``None`` (or passed as ``0``) leave the matching
        entry untouched.
        """
        if timeout:
            self.info['timeout'] = timeout
        if connect:
            self.info['connecttimeout'] = connect
        if wait:
            self.info['wait'] = wait
        if script:
            self.info['script_timeout'] = script
        return self

    def use_info(self, info):
        """Replace the whole settings dict with *info* (no defaults re-added)."""
        self.info = info
        return self

    def use_final(self, final):
        """Set the ``'final'`` entry."""
        self.info['final'] = final
        return self

    def use_mode(self, mode):
        """Set ``'mode'`` and re-derive the worker settings for it.

        If a spider is already configured it is re-applied under the new
        mode; otherwise only ``'coroutine'`` mode picks a spider
        (``'aiohttp'``) on its own.
        """
        self.info['mode'] = mode
        spider = self.info.get('spider')
        if spider:
            self.use_spider(spider)
        elif mode == 'coroutine':
            self.use_spider('aiohttp')
        return self

    def use_spider(self, spider='pycurl'):
        """Set ``'spider'`` and the worker entries that match the current mode.

        The mode is read from ``self.info`` and defaults to ``'thread'``.
        Combinations with no mapping leave the worker entries unchanged.
        """
        self.info['spider'] = spider
        mode = self.info.get('mode', 'thread')
        if mode == 'celery':
            self.info.update(self._CELERY_SETTINGS.get(spider, {}))
        elif mode == 'coroutine' and spider == 'aiohttp':
            self.info['worker'] = 'netboy.aio_http.aiohttp_handler.aiohttp_handler'
        elif spider in self._DEFAULT_WORKERS:
            self.info['worker'] = self._DEFAULT_WORKERS[spider]
        return self

    def use_workers(self, workers=8, chunk_size1=40, chunk_size2=8):
        """Set the worker counts and the two chunk sizes.

        The larger of the two sizes becomes ``'celery_chunk_size'`` and the
        smaller becomes ``'chunk_size'``, whatever order they are passed in.
        """
        self.info['celery_max_workers'] = workers
        self.info['max_workers'] = workers
        self.info['celery_chunk_size'] = max(chunk_size1, chunk_size2)
        self.info['chunk_size'] = min(chunk_size1, chunk_size2)
        return self

    def use_chrome(self, chrome):
        """Set the ``'chrome'`` entry and switch the spider to ``'chrome'``."""
        self.info['chrome'] = chrome
        self.use_spider('chrome')
        return self

    def use_window(self, window):
        """Set ``'window_size'`` and switch the spider to ``'chrome'``."""
        self.info['window_size'] = window
        self.use_spider('chrome')
        return self

    def run(self, data):
        """Create a ``Worker`` for the configured mode and run *data* through it."""
        self.worker = Worker(mode=self.info.get('mode', 'thread'))
        return self.worker.work(data, self.info)

    def run_remote(self, url, data, callback_data=None):
        """Pass the settings and *data* to ``curl_work`` as a POST payload for *url*.

        A ``post_it`` trigger, extended with *callback_data* when given, is
        added to the triggers sent in the payload. The stored settings are
        left untouched, so repeated calls do not pile up duplicate triggers.
        """
        # Kept for backward compatibility: callers may read ``self.worker``.
        self.worker = Worker(mode=self.info.get('mode', 'thread'))

        trigger_payload = {'trigger': 'netboy.support.triggers.post_it'}
        if callback_data:
            trigger_payload.update(callback_data)

        # Build the trigger list for this call only, without appending to
        # the list held in self.info.
        remote_info = copy(self.info)
        remote_info['triggers'] = list(self.info.get('triggers') or []) + [trigger_payload]

        payload = {
            'url': url,
            'method': 'post',
            'postfields': {
                'info': remote_info,
                'data': data,
            },
        }
        return curl_work(payload, logger='netboy')

    def register_remote(self, url, user, password, group='default'):
        """Pass the credentials to ``curl_work`` as a POST payload for *url*."""
        payload = {
            'url': url,
            'method': 'post',
            'postfields': {
                'user': user,
                'password': password,
                'group': group,
            },
        }
        return curl_work(payload, logger='netboy')