def webdir_crawler_schedu(self):
    '''
    Schedule web directory brute-force tasks.
    :return:
    '''
    # Dictionary-based directory probing at the web root
    first_crack_dir_pool = pool.Pool(self.concurrent_num)
    first_crack_dir_pool.map(
        self.__webdir_crawler_work,
        ['%s/' % str(dir_dic) for dir_dic in self.dir_dic])
    self.__deal_exist_file(bModel=False)
    while not self.exist_dir_cache_que.empty():
        dir = self.exist_dir_cache_que.get_nowait()
        # Check the depth of the current path
        if len(dir.split('/')) <= self.depth:
            # Bug fix: probe a directory that definitely does not exist under the
            # known directory and check the response status before brute-forcing it
            if self.__subdir_crawler_status(dir):
                dir_tmpdic = [
                    '%s/%s/' % (dir, dir_dic) for dir_dic in self.dir_dic
                ]
                path_pool = pool.Pool(self.concurrent_num)
                path_pool.map(self.__webdir_crawler_work, dir_tmpdic)
                self.__deal_exist_file(bModel=False)
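# Hedged sketch (not the project's actual implementation) of the wildcard check the
# bug-fix comment above describes: request a path that cannot exist under `dir` and
# only trust findings if the server answers 404 rather than a fake 200. The use of
# `requests`, the random path, and the 404 heuristic are all assumptions.
import uuid
import requests

def subdir_crawler_status_sketch(base_url, dir):
    # Probe a path that is virtually guaranteed not to exist under `dir`.
    probe = '%s%s/%s/' % (base_url, dir, uuid.uuid4().hex)
    return requests.get(probe, timeout=5).status_code == 404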
def __init__(self, log_file_name="my.log"):
    # Default domain
    self.host = "http://www.kuwo.cn"
    # API returning JSON with a song's rid for a search keyword
    self.rid_url = "/api/www/search/searchMusicBykeyWord?key={}"
    # API returning JSON with the song's download link for an rid
    self.mp3_url = "/url?rid={}&type=convert_url3&br=128kmp3"
    # Music charts endpoint; yields sourceid values
    self.bang_menu = "/api/www/bang/bang/bangMenu"
    # Song metadata endpoint
    self.music_info = "/api/www/music/musicInfo?mid={}"
    # Lyrics endpoint, keyed by musicId
    self.song_lyric = "http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId={}"
    # Song list endpoint, keyed by bangId
    self.music_list = "/api/www/bang/bang/musicList?bangId={}&pn={}&rn={}"
    # Required request headers
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "Referer": "http://www.kuwo.cn/search/list",  # without this header the server returns 403 Forbidden
        "csrf": "0HQ0UGKNAKR",  # without this: CSRF Token Not Found!
        "Cookie": "Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1584003311; _ga=GA1.2.208068437.1584003311; _gid=GA1.2.1613688009.1584003311; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1584017980; kw_token=0HQ0UGKNAKR; _gat=1",
    }
    # Thread-safe queue of mp3 download tasks
    self.mp3_q = queue.Queue()
    # gevent pools
    self.pool1 = pool.Pool()
    self.pool2 = pool.Pool()
    # Local proxy pool
    self.proxies = [
        {'http': '116.114.19.204:443'},
        {'http': '101.231.104.82:80'},
        {'http': '116.114.19.211:443'},
        {'http': '84.17.47.190:80'},
    ]
    # Log file name
    # self.log_file_name = log_file_name
    self.f = open("{}/{}".format(BASE_DIR, log_file_name), "w", encoding="utf-8")
    self.f.write("log: use time second = minute = hour\n")
    # Filter invalid characters out of item["name"]
    self.invalid_characters = r"[/\?]"
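# Hedged usage sketch of how the endpoint templates above are filled in and joined
# with self.host; the keyword value is an illustrative assumption only.
def build_search_url(host, rid_url_template, keyword):
    # e.g. "http://www.kuwo.cn" + "/api/www/search/searchMusicBykeyWord?key=Eason"
    return host + rid_url_template.format(keyword)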
def __init__(self, page):
    self.url = 'http://book.zongheng.com/store/c0/c0/b0/u0/p{}/v9/s9/t0/u0/i1/ALL.html'  # the page number varies
    self.num = 1
    self.page = page
    self.useragent = UserAgent()
    # self.connect = pymysql.connect(host='localhost', port=3306, user='******', password='******', db='scrapytest')
    # self.cur = self.connect.cursor()
    self.queue = queue.Queue()
    self.pool = pool.Pool(2)
    self.queue_detail = queue.Queue()
    self.pool_detail = pool.Pool(3)
def __init__(self):
    """
    Initializer.
    """
    MyBaseSpider.__init__(self)
    # API client
    self.api = Music163ComAPI()
    # Custom downloader
    self.downloader = Downloader()
    # Coroutine pool
    self.pool = pool.Pool()
    # Song queue
    self.m4a_q = queue.Queue()
    # Chart (bang) pool
    self.bang_pool = pool.Pool()
def get_http_utilization_for_all_tasks(marathon_service_config, marathon_tasks, endpoint, json_mapper):
    """
    Gets the mean utilization of a service across all of its tasks by fetching
    json from an http endpoint and applying a function that maps it to a utilization.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param endpoint: The http endpoint to get the stats from
    :param json_mapper: A function that takes a dictionary for a task and returns that task's utilization
    :returns: the service's mean utilization, from 0 to 1
    """
    endpoint = endpoint.lstrip('/')
    utilization = []
    service = marathon_service_config.get_service()

    monkey.patch_socket()
    gevent_pool = pool.Pool(20)
    jobs = [
        gevent_pool.spawn(get_http_utilization_for_a_task, task, service, endpoint, json_mapper)
        for task in marathon_tasks
    ]
    gevent.joinall(jobs)

    for job in jobs:
        if job.value is not None:
            utilization.append(job.value)

    if not utilization:
        raise MetricsProviderNoDataError("Couldn't get any data from http endpoint {} for {}.{}".format(
            endpoint, marathon_service_config.service, marathon_service_config.instance,
        ))
    return mean(utilization)
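# Hedged usage sketch (not from the source): json_mapper is any callable that maps
# one task's stats dictionary to a utilization in [0, 1]. The field names 'used'
# and 'limit' below are illustrative assumptions only.
def example_json_mapper(task_stats):
    # Turn a single task's stats JSON into a 0-1 utilization figure.
    return float(task_stats['used']) / float(task_stats['limit'])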
def run(self):
    from gevent.server import StreamServer
    from gevent.pywsgi import WSGIServer
    from gevent.backdoor import BackdoorServer
    import gm.app  # NOQA
    from gm.init_app import application
    import settings as st

    # wait for proxy
    self.ping_proxy()

    threads = []

    logger.info('listening 0.0.0.0:%d', st.WORLD['port'])
    self.mainServer = StreamServer(('0.0.0.0', st.WORLD['port']), self.handle_client)
    threads.append(Greenlet.spawn(self.mainServer.serve_forever))

    logger.info('listening %s:%d', st.WORLD['managehost'], st.WORLD['manageport'])
    threads.append(
        Greenlet.spawn(
            WSGIServer((st.WORLD['managehost'], st.WORLD['manageport']),
                       application, spawn=pool.Pool(10)).serve_forever))

    if os.environ.get("DOCKER_MANAGEHOST"):
        backdoorhost = "0.0.0.0"
    else:
        backdoorhost = "127.0.0.1"
    logger.info('listening %s:%d', backdoorhost, st.WORLD['backdoorport'])
    threads.append(
        Greenlet.spawn(
            BackdoorServer(
                (backdoorhost, st.WORLD['backdoorport'])).serve_forever))

    # start cron thread
    import cron_settings  # NOQA
    threads.append(Greenlet.spawn(self.heart_beat))

    joinall(threads)
def start(self):
    start_time = time.time()
    self.start_time = int(start_time)
    log.debug("useful_proxy proxy verify start")
    self.stat = dict(
        total=0,
        succ=0,
        fail=0,
    )
    concurrency = ConfigManager.setting_config.setting.get("verify_useful_proxy_concurrency")
    task_pool = pool.Pool(concurrency)
    queue_size = self.queue.qsize()
    greenlet_list = []
    for _ in range(queue_size):
        greenlet_list.append(task_pool.spawn(self.verify))

    gevent.joinall(greenlet_list)
    end_time = time.time()
    elapsed_time = int(end_time - start_time)
    log.info('useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
             .format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"],
                     elapsed_time=elapsed_time))
def __init__(self, qqno, qqpwd, handler=None):
    self.handler = handler if handler else MessageHandner(self)
    self.uin = qqno
    self.qqpwd = qqpwd
    self.ptwebqq = ""
    self.psessionid = ""
    self.clientid = str(random.randint(1, 99999999))
    self.vfwebqq = ""
    self.vcode = ""
    self.vcode2 = ""
    self.cookiefile = "/tmp/cookies.lwp"
    self.cookiejar = cookielib.LWPCookieJar(filename=self.cookiefile)
    self.fakeid = ""
    self.friends = None
    self.friendindex = 1
    self.uintoqq = {}
    self.referurl = "http://d.web2.qq.com/proxy.html?v=20110331002&callback=1&id=2"
    self.headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Referer": "http://d.web2.qq.com/proxy.html?v=20110331002&callback=1&id=2",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    self.mq = queue.Queue(20)
    self.taskpool = pool.Pool(10)
    self.runflag = False
    from redis import Redis
    self.redisconn = Redis(host="localhost", db=10)
    self.logger = getLogger()
    self.session = requests.Session()
    self.session.headers = self.headers
def start():
    global son, billing_cycles
    try:
        # Fetch the last three months' billing cycles
        cur_month = datetime.strftime(datetime.now(), '%Y%m')
        cur_cycle = ''.join(['1', str(cur_month)[2:]])
        billing_cycles = son.query(BillingCycle).filter(
            BillingCycle.billing_cycle_id < cur_cycle,
            BillingCycle.billing_cycle_id >= MIN_BILLING_CYCLE,
        ).order_by(BillingCycle.billing_cycle_id.desc()).limit(LIMIT_MONTH).all()

        # Supplementary (re-send) red-packet records
        bs = son.query(ARedPacketBs).filter(
            ARedPacketUser.red_id == ARedPacketBs.red_id
        )
        packet_users = son.query(
            ARedPacketUser
        ).filter(
            ARedPacketUser.finish_flag == FINISH_FLAG_NORMAL,
            bs.exists(),
        ).all()
        logger.info('len(packet_list)=%d' % len(packet_users))

        p = pool.Pool(500)
        jobs = []
        for user in packet_users:
            jobs.append(p.spawn(process, user.red_id))
        gevent.joinall(jobs)
    except Exception, ex:
        logger.error(ex.message)
def start():
    global son, billing_cycles
    # Fetch the last three months' billing cycles
    cur_month = datetime.strftime(datetime.now(), '%Y%m')
    cur_cycle = ''.join(['1', str(cur_month)[2:]])
    billing_cycles = son.query(BillingCycle).filter(
        BillingCycle.billing_cycle_id < cur_cycle,
        BillingCycle.billing_cycle_id >= MIN_BILLING_CYCLE,
    ).order_by(BillingCycle.billing_cycle_id.desc()).limit(LIMIT_MONTH).all()
    min_date = min(map(lambda e: e.cycle_begin_date, billing_cycles))
    max_date = max(map(lambda b: b.cycle_end_date, billing_cycles))

    packet_users = son.query(ARedPacketUser).filter(
        ARedPacketUser.finish_flag == FINISH_FLAG_NORMAL,
        ARedPacketUser.wing_eff_date < max_date,
        ARedPacketUser.wing_exp_date >= min_date,
        ARedPacketUser.red_id.in_([100439718])).all()

    p = pool.Pool(500)
    jobs = []
    for user in packet_users:
        jobs.append(p.spawn(process, user.red_id))
    gevent.joinall(jobs)
    return 'pool tasks finished'
def main():
    total_count = get_total_count()
    args = xrange(0, total_count, 10000)
    pool_size = 20
    # pool_size = total_count/10000
    p = pool.Pool(pool_size)
    p.map(update_url_batch, args)
def kickstart(itemset=None, poolsize=20):
    if itemset is None:
        itemset = getitems()
    logger.info("Found %s records to check", len(itemset))
    time.sleep(3)
    start = datetime.now()
    results = Counter()
    count = 0
    p = pool.Pool(poolsize)

    def wkfn(k):
        try:
            return (k, objfn(k))
        except Exception as e:
            logger.exception("Failed on %r", k)
            return (k, e)

    try:
        for k, result in p.imap_unordered(wkfn, list(itemset)):
            count += 1
            if isinstance(result, Exception):
                raise result
            if result is Status.ok or result is Status.rederive:
                itemset.remove(k)
            results[result] += 1
            if count % 100 == 0:
                rate = count / max([(datetime.now() - start).total_seconds(), 1])
                remaining = len(itemset) / rate
                logger.info("Processed %d records at %4.1f/s; %6.1fs remaining",
                            count, rate, remaining)
    finally:
        logger.info("Finished %d records, %r", count, results)
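# Hedged sketch of the Status enum kickstart relies on; only the `ok` and
# `rederive` members can be inferred from the usage above, and the real
# definition may carry other members and values.
from enum import Enum

class Status(Enum):
    ok = 'ok'
    rederive = 'rederive'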
def get_friends(user):
    api = twitter_api.Twitter(auth=twitter_api.OAuth(
        user.twitter_oauth.token,
        user.twitter_oauth.token_secret,
        twitter.consumer_key,
        twitter.consumer_secret,
    ))

    # Twitter allows lookup of 100 users at a time so we need to
    # chunk:
    chunk = lambda l, n: [l[x:x + n] for x in xrange(0, len(l), n)]

    friend_ids = list(api.friends.ids()['ids'])

    greenpool = pool.Pool(4)

    # Look up in parallel. Note that twitter has pretty strict 15
    # requests/second rate limiting.
    friends = []
    for result in greenpool.imap(
            lambda ids: api.users.lookup(user_id=','.join(
                str(id) for id in ids)),
            chunk(friend_ids, 100)):
        for r in result:
            friends.append(
                user_model.Friend(
                    screen_name=r['screen_name'],
                    name=r['name'],
                    profile_image_url=r['profile_image_url'],
                ))
    return sorted(friends)
def scanOnce(cls, concurrentNum=multiprocessing.cpu_count()):
    concurrentPool = pool.Pool(concurrentNum)
    cnEtfList = ETF.getCnETFList()
    for etf in cnEtfList:
        concurrentPool.spawn(cls.scanOne, etf)
    concurrentPool.join()
    print('scan once')
def begin_get_predict_data(self):
    # logger.info('Start fetching expert prediction data for "%s".' % self.lottery_name)
    times = 0
    while 1:
        found_data = None
        find_data = {'lottery': self.lottery_name}
        filter_data = {'_id': 0}
        try:
            found_data = list(
                self.predict_urls_db.find(find_data, filter_data))
        except Exception as e:
            logger.error(e)
        logger.info(
            'db:%s find_data:%s, found_data:%s' %
            (self.predict_urls_db, find_data, len(list(found_data))))
        if len(found_data) > 0:
            p = pool.Pool(100)
            for url_data in found_data:
                expert_id = url_data['expert_id']
                data_type = url_data['data_type']
                url = url_data['url']
                gpd = GPD()
                gpd.set(self.lottery_name, expert_id, data_type, url)
                p.spawn(gpd.start)
                time.sleep(0.1)
            p.join()
        time.sleep(5)
        times += 1
        if (not found_data) or times > 4:
            print('Attempts: %s' % times)
            break
def get_transactions_of_block_at_height(self, height: gw.CoinBlockHeight) -> List[gw.Transaction]:
    block_hash = self._ltc_proxy.getblockhash(height)
    block = self._ltc_proxy.getblock(block_hash)
    # litecoin server does not accept more than one parallel connection
    get_transaction_tasks = pool.Pool()
    return [a for a in get_transaction_tasks.map(self.get_transaction, block['tx'])]
def run(self):
    scannerparam = self.scannerparam
    socket.setdefaulttimeout(scannerparam.timeout)
    _pool = pool.Pool(scannerparam.threadnum)
    time_start = time.time()
    print('Starting scan...')
    params = []
    iplist = IpHelper.get_ip_list(scannerparam)
    portlist = PortHelper.get_port_list(scannerparam)
    for ip in iplist:
        for port in portlist:
            params.append({'ip': ip, 'port': port})
    self.taskcount = len(params)
    print('Threads: {0}, total IPs: {1}, tasks to scan: {2}'.format(scannerparam.threadnum, len(iplist), self.taskcount))
    for p in params:
        _pool.spawn(self.scan, p)
    _pool.join()  # wait for all scan tasks to finish before reporting results
    time_end = time.time()
    print('Finished; took {0} seconds in total'.format(time_end - time_start))
    for x in self.open_ports:
        print("{0}:{1} open \n".format(x['ip'], x['port']))
def test_imap(self):
    p = pool.Pool(1)
    it = p.imap(divide_by, [1, 0, 2])
    self.assertEqual(next(it), 1.0)
    self.assertRaises(ZeroDivisionError, next, it)
    self.assertEqual(next(it), 0.5)
    self.assertRaises(StopIteration, next, it)
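# Hedged sketch of the divide_by helper these imap tests assume; the real test
# fixture is not shown, but to satisfy the assertions above it must return the
# reciprocal of its argument and raise ZeroDivisionError for 0.
def divide_by(x):
    # divide_by(1) -> 1.0, divide_by(2) -> 0.5, divide_by(0) -> ZeroDivisionError
    return 1.0 / x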
def __init__(self, domain, flag):
    self.domain = domain
    self.flag = flag
    self.p = pool.Pool(20)
    self.tasks = []
    self.subDomainEngine = []
    self.zDomainList = []
def main():
    # work()
    # return
    p = pool.Pool(200)
    while True:
        p.spawn(work)
def track(self) -> List[dict]:
    tracker_logger.info('Tracking branches for %s' % self._remote)
    include = re.compile(r'|'.join(self._branches or []), re.IGNORECASE)
    exclude = re.compile(r'|'.join(self._no_branches or []), re.IGNORECASE)
    branches = self._wrapper.get_branches(self._remote)
    filtered = []
    for branch, authordate in branches:
        if self._branches and not include.search(branch):
            continue
        if self._no_branches and exclude.search(branch):
            continue
        if self._after_date and authordate < self._after_date:
            continue
        if self._before_date and authordate > self._before_date:
            continue
        filtered.append(branch)

    def track_branch(branch):
        return branch, self._track_branch(branch)

    gpool = pool.Pool(self._greenlets)
    for branch, tracked in gpool.imap(track_branch, filtered):
        yield self._remote, branch, tracked
def start_crawling(self, url: str, show_urls=False, allow_query_string=False):
    """Initialize job-specific variables and start a scheduler to assign crawling jobs.
    Return all unique urls."""
    self.base_url = urlparse(url)
    assert self.base_url.netloc != '', "Error: no url found in url {}".format(url)
    self.show_urls = show_urls  # set to True to print urls to the console
    self.allow_query_string = allow_query_string  # set to True if a query string should count as a unique url
    self.worker_pool = pool.Pool(self.worker_pool_size)
    # Queue of the next urls to crawl. TODO: use a database for more scalability.
    self.urlQ = queue.Queue(maxsize=self.queue_size)
    self.urlQ.put(url)  # initialize the queue with the first url
    # Set of visited urls. TODO: use a database for more scalability.
    self.crawled_urls = set()
    # File for saving intermediate results. TODO: use it as a cache to resume a crawl.
    self.crawled_urls_file = open("urls.txt", "w+")
    # Event announcing that a greenlet finished crawling; the scheduler consumes it to assign new urls.
    self.url_crawled = event.Event()
    self._scheduler = gevent.spawn(self.scheduler)  # start the scheduler
    self._scheduler.join()  # wait for the scheduler to finish
    return self.crawled_urls  # return unique urls
def test_imap_unordered(self):
    p = pool.Pool(1)
    it = p.imap_unordered(divide_by, [1, 0, 2])
    self.assertEqual(it.next(), 1.0)
    self.assertRaises(ZeroDivisionError, it.next)
    self.assertEqual(it.next(), 0.5)
    self.assertRaises(StopIteration, it.next)
def write_in_slave(self, song_id, start, end, hot=False):
    start, end = int(start), int(end)
    data_gen = self.post_data(0, start=start, end=end)
    pool = g_pool.Pool(size=self._pool_size)
    for _ in range(start, end, self._limit):
        pool.spawn(self.write_wrapper, song_id, data_gen, hot)
    pool.join()
def wait(self, timeout):
    p = pool.Pool()
    g = p.spawn(gevent.sleep, 10)
    try:
        p.join(timeout=timeout, raise_error=True)
    finally:
        g.kill()
def run(self, seedfile, progress_queue, output_queue):
    task_total = count_file_linenum(seedfile)
    proc_name = current_process().name
    sys.stdout = ProcessIO(output_queue)

    def progress_tracking(greenlet):
        count = getattr(progress_tracking, 'count', 0) + 1
        setattr(progress_tracking, 'count', count)
        progress_queue.put((proc_name, count, task_total))
        return greenlet

    po = pool.Pool(self.pool_size)
    with open(seedfile) as f:
        for line in f:
            g = po.apply_async(func=self.pool_task_with_timeout,
                               args=(line, ),
                               kwds=None,
                               callback=self.callback)
            g.link(progress_tracking)
            po.add(g)
    try:
        po.join()
    except (KeyboardInterrupt, SystemExit) as ex:
        print(str(ex))
        po.kill()
def process_ip_list(ip_list, output_file, start):
    ip_list_num = len(ip_list)
    for ip_pointer in range(start, start + ip_list_num, 1000):
        sem.acquire()
        print("[*] Start scanning at", ip_pointer)
        print(ip_port_dict)
        ip_port_dict.clear()
        with open("scan.log", "a") as f:
            f.write(str(ip_pointer))
        pool_ = pool.Pool(200)
        gevent_list = []
        for ip_addr in ip_list[ip_pointer:ip_pointer + 1000]:
            g = pool_.spawn(scan, ip_addr)
            gevent_list.append(g)
        gevent.joinall(gevent_list)
        headers = ["IP", "端口"]
        while flag[0] > 0:
            pass
        with open(output_file, 'a') as csv_file:
            csv_write = csv.writer(csv_file)
            if ip_pointer == 0:
                csv_write.writerow(headers)
            csv_write.writerows(list(ip_port_dict.items()))
        flag[0] = 1000
        sem.release()
def test_imap_unordered(self):
    p = pool.Pool(1)
    it = p.imap_unordered(divide_by, [1, 0, 2])
    self.assertEqual(six.advance_iterator(it), 1.0)
    self.assertRaises(ZeroDivisionError, six.advance_iterator, it)
    self.assertEqual(six.advance_iterator(it), 0.5)
    self.assertRaises(StopIteration, six.advance_iterator, it)
def start():
    p = pool.Pool(pool_size)
    gevent.spawn(stop, p)
    while alive:
        st = time.time()
        p.spawn(work)
def run(self, proxyList):
    logger.info('Running Validator.')
    self.rator.begin()
    while 1:
        try:
            if proxyList:
                self.rator.pull_table(self.db.table)
                pen = len(proxyList)
                logger.info('Proxies from Collector is detected,length : %d ' % pen)
                pop_len = pen if pen <= VALIDATE_AMOUNT else VALIDATE_AMOUNT
                stanby_proxies = [proxyList.pop() for x in range(pop_len)]
                logger.info('Start to verify the collected proxy data,amount: %d ' % pop_len)
                gpool = pool.Pool(CONCURRENCY)
                gevent.joinall([
                    gpool.spawn(self.validate_proxy, i)
                    for i in stanby_proxies if i
                ])
                logger.info('Validation finished.Left collected proxies:%d' % len(proxyList))
            time.sleep(VALIDATE_F)
        except Exception as e:
            logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
            self.rator.end()
            logger.info('Validator shuts down.')
            return