def gtaskManager(self, urls, extractSearchResults, proxy_flag=0, ua_flag=0):
    """Configure the proxy/user-agent pools and run the URL-fetch tasks.

    Args:
        urls: iterable of URLs forwarded to AccessUrls.taskGenerator.
        extractSearchResults: result-extraction callback forwarded to the
            task generator.
        proxy_flag: 1 to fetch and rotate HTTP proxies; 0 to go direct.
        ua_flag: 1 to fetch a user-agent list; 0 to use the default UA.

    Side effects: rebinds the module globals `proxymgr` and `useragents`,
    then blocks in gtaskpool.runtasks until all tasks complete.
    """
    task_log = None  # no per-task log file; gtaskpool logs at INFO level
    gtaskpool.setlogging(logging.INFO, task_log)
    purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    # Rule applies to every http/https URL (limit value 0 — confirm its
    # exact semantics against ProxyManager).
    limited_urls = [('^https{0,1}://', 0)]
    global proxymgr
    if proxy_flag == 1:
        # Refresh the proxy list every 30 minutes with an 8-minute delay
        # parameter (presumably proxy retirement — verify in ProxyManager).
        proxymgr = ProxyManager(get_http_proxies, limited_urls,
                                {'refresh': True, 'interval': 30 * 60,
                                 'delay': 8 * 60}, *purl1)
    else:
        proxymgr = None
    # FIX: removed leftover debug `print proxymgr` that polluted stdout.
    global useragents
    if ua_flag == 1:
        useragents = get_useragents(uurl1)
        if not useragents:
            # Service returned nothing: fall back to the default UA.
            useragents = [None]
    else:
        useragents = [None]
    gtaskpool.runtasks(AccessUrls.taskGenerator(self, urls, extractSearchResults))
def gtaskmanager(self, engine_type):
    """Set up the proxy pool and user-agent list, then run crawl tasks.

    Args:
        engine_type: search-engine identifier forwarded to
            self.task_generator.

    Side effects: rebinds the module globals `proxymgr` and `useragents`,
    then blocks in gtaskpool.runtasks until all tasks complete.
    """
    # task_log = 'task_log.log'
    task_log = None
    gtaskpool.setlogging(logging.INFO, task_log)
    purl1 = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    # Rate-limit rule for the disconnect.me search frontend (limit value 1
    # — confirm its exact semantics against ProxyManager).
    limited_urls = [(r'^https://search\.disconnect\.me', 1)]
    global proxymgr
    # Refresh the proxy list every 30 minutes; 8-minute delay parameter.
    proxymgr = ProxyManager(get_http_proxies, limited_urls,
                            {'refresh': True, 'interval': 30 * 60,
                             'delay': 8 * 60}, *purl1)
    global useragents
    useragents = get_useragents(uurl1)
    if not useragents:
        # Service returned nothing: fall back to the default UA.
        useragents = [None]
    # FIX: task_generator is invoked as a bound method, so the original
    # `self.task_generator(self, engine_type)` supplied `self` twice and
    # would raise TypeError for an instance method.
    gtaskpool.runtasks(self.task_generator(engine_type))
def gtaskmanager(self, engine_type):
    """Build the proxy/user-agent infrastructure and run the crawl tasks.

    Args:
        engine_type: search-engine identifier forwarded to
            self.task_generator.

    Side effects: rebinds the module globals `proxymgr` and `useragents`,
    then blocks in gtaskpool.runtasks until all tasks complete.
    """
    # task_log = 'task_log.log'
    task_log = None
    gtaskpool.setlogging(logging.INFO, task_log)
    purl1 = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    # Rate-limit rule for the disconnect.me frontend (limit value 1 —
    # confirm exact semantics against ProxyManager).
    limited_urls = [(r'^https://search\.disconnect\.me', 1)]
    global proxymgr
    proxymgr = ProxyManager(get_http_proxies, limited_urls, {
        'refresh': True,      # re-fetch the proxy list periodically
        'interval': 30 * 60,  # every 30 minutes
        'delay': 8 * 60       # 8-minute delay parameter
    }, *purl1)
    global useragents
    useragents = get_useragents(uurl1)
    if not useragents:
        # Service returned nothing: fall back to the default UA.
        useragents = [None]
    # FIX: bound-method call means `self` is passed implicitly; the
    # original `self.task_generator(self, engine_type)` supplied it twice
    # and would raise TypeError for an instance method.
    gtaskpool.runtasks(self.task_generator(engine_type))
# Script entry: configure logging, build a refreshing ProxyManager and a
# user-agent pool, then drive task_generator() through gtaskpool while the
# result/leftover files are open.
gtaskpool.setlogging(logging.INFO)
purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
purl2 = ["http://192.168.1.14:5500/get_http_proxy_list"]
uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
uurl2 = "http://192.168.1.14:5500/get_useragent_list"
# Create a ProxyManager if you need one.
limited_urls = [(r"^http://www\.baidu\.com/s\?wd=apple&pn=\d+$", 1)]
proxymgr = ProxyManager(
    get_http_proxies,
    limited_urls,
    {"refresh": True, "interval": 30 * 60, "delay": 8 * 60},
    *purl2
)
# Or, if you don't want to refresh proxies periodically:
# proxymgr = ProxyManager(get_http_proxies, *purl2, limited_urls, \
#     {'refresh': False}, *purl2)
# A user-agent list for HTTP requests, if you need one.
useragents = get_useragents(uurl2)
if not useragents:
    useragents = [None]
# FIX: use `with` so both files are closed even if runtasks raises
# (the bare open()/close() pair leaked the handles on error).
with open("result.txt", "w") as fresult, open("left.txt", "w") as fleft:
    # Optional args:
    # @max_ongoing_tasks (default to 1000)
    gtaskpool.runtasks(task_generator())
# NOTE(review): this `while` block is the tail of a retry wrapper whose
# `def` line lies outside the visible chunk; `task`, `task_log`, `trycnt`
# and `max_try` come from that enclosing scope.
while trycnt != max_try:
    res = task()                 # run one attempt; yields a result dict
    res['try_idx'] = trycnt + 1  # 1-based attempt counter
    # Mark the record as the final attempt when retries are exhausted or
    # the task reported success.
    if trycnt+1 == max_try or res['finish']:
        res['last_try'] = True
    else:
        res['last_try'] = False
    log_task_result(res, task_log)
    if res['finish']:
        return
    trycnt += 1


def log_task_result(result, filehandle):
    # Stamp the result with the current wall-clock time and append it to
    # the log as a single UTF-8 encoded JSON line.
    result['ts'] = str(datetime.now())
    jstr = json.dumps(result, ensure_ascii=False).encode('utf-8')
    filehandle.write(jstr + "\n")


def runtasks(task_generator, task_log, max_try=10):
    # Wrap each task produced by `task_generator` in a retrying
    # gtaskpool.Task (up to `max_try` attempts) and hand the lazy stream
    # to the pool.
    def gen_task():
        while True:
            try:
                task = task_generator.next()  # Python 2 iterator protocol
            except StopIteration, e:
                return
            yield gtaskpool.Task(retry_task, [task, task_log, max_try])
    gtaskpool.runtasks(gen_task())
# NOTE(review): duplicate of the preceding chunk. This `while` block is
# the tail of a retry wrapper whose `def` line lies outside the visible
# chunk; `task`, `task_log`, `trycnt` and `max_try` come from that scope.
while trycnt != max_try:
    res = task()                 # run one attempt; yields a result dict
    res['try_idx'] = trycnt + 1  # 1-based attempt counter
    # Mark the record as the final attempt when retries are exhausted or
    # the task reported success.
    if trycnt + 1 == max_try or res['finish']:
        res['last_try'] = True
    else:
        res['last_try'] = False
    log_task_result(res, task_log)
    if res['finish']:
        return
    trycnt += 1


def log_task_result(result, filehandle):
    # Stamp the result with the current wall-clock time and append it to
    # the log as a single UTF-8 encoded JSON line.
    result['ts'] = str(datetime.now())
    jstr = json.dumps(result, ensure_ascii=False).encode('utf-8')
    filehandle.write(jstr + "\n")


def runtasks(task_generator, task_log, max_try=10):
    # Wrap each task produced by `task_generator` in a retrying
    # gtaskpool.Task (up to `max_try` attempts) and hand the lazy stream
    # to the pool.
    def gen_task():
        while True:
            try:
                task = task_generator.next()  # Python 2 iterator protocol
            except StopIteration, e:
                return
            yield gtaskpool.Task(retry_task, [task, task_log, max_try])
    gtaskpool.runtasks(gen_task())
#!/usr/bin/env python # encoding: utf-8 import gtaskpool import requests import logging def task(n1, n2): logging.info("task(%s, %s): called", n1, n2) r = requests.get("http://www.baidu.com") print "task(%s, %s): response (len=%s): %s..." % \ (n1, n2, len(r.text), r.text[:min(100, len(r.text))]) logging.info("task(%s, %s): finished", n1, n2) def task_generator(): task_num = 10 for i in xrange(1, task_num + 1): yield gtaskpool.Task(task, [i, i]) if __name__ == "__main__": gtaskpool.setlogging(logging.INFO) gtaskpool.runtasks(task_generator())