def batch_fetch(self, queue, event, linger_ms, max_queued_messages):
    if queue.qsize() < max_queued_messages:
        event.wait(linger_ms / 1000)
    if event.is_set():
        event.clear()
    batch_msgs = [queue.get() for _ in range(queue.qsize())]
    return batch_msgs
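# For context, a minimal standalone sketch of the same linger-and-drain batching
# pattern using only the standard library; the function name make_batch and the
# driver code below are illustrative, not part of the original source.
import queue
import threading

def make_batch(q, event, linger_ms, max_queued_messages):
    # Wait up to linger_ms for more messages unless a full batch is already queued,
    # then drain whatever is currently in the queue into one list.
    if q.qsize() < max_queued_messages:
        event.wait(linger_ms / 1000)
    if event.is_set():
        event.clear()
    return [q.get() for _ in range(q.qsize())]

q = queue.Queue()
wakeup = threading.Event()
for i in range(3):
    q.put(i)
wakeup.set()  # a producer would set this once a full batch is ready early
print(make_batch(q, wakeup, linger_ms=100, max_queued_messages=2))  # [0, 1, 2]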
def close(self):
    # Close the queue. There are 2 possibilities:
    # 1. The file buffer is non-empty and there's a greenlet
    #    emptying it. (See the feed greenlet in the put method.)
    #    The greenlet is blocked putting data in the underlying
    #    queue. We can set size to -1, marking us as closed, and
    #    close the file. The greenlet will check size before
    #    trying to read the file again.
    # 2. The file buffer is empty and there's no running greenlet.
    #    We can set the size to -1 and close the file.
    # In either case, we'll empty the underlying queue, both for
    # cleanliness and to unblock a greenlet, if there is one, so
    # it can die a normal death.
    if self.size < 0:
        return  # already closed
    self.size = -1
    self.file.close()
    queue = self.queue
    while queue.qsize():
        queue.get()
    self.size_bytes = 0
def flush(self):
    """Forces a flush from the internal queue to the server"""
    queue = self.queue
    size = queue.qsize()
    queue.join(JOIN_TIMEOUT_SECONDS)
    # Note that this message may not be precise, because of threading.
    self.log.debug('successfully flushed about %s items.', size)
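# Note that the standard library's queue.Queue.join() takes no timeout, so the
# queue.join(JOIN_TIMEOUT_SECONDS) call above implies a queue class with its own
# join signature. A minimal sketch of the underlying task_done()/join() contract
# with a plain queue.Queue follows; the worker and item names are illustrative.
import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()
        # ... send item to the server here ...
        q.task_done()  # join() returns once every put() has a matching task_done()

threading.Thread(target=worker, daemon=True).start()
for item in range(5):
    q.put(item)
q.join()  # blocks until all five items have been marked done
print('flushed')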
def crawler(n):
    """
    This is the worker routine, the heart of this solution.
    The job is performed by the following steps:
    1. take a url from the queue
    2. make a request to this url
    3. mark it as visited
    4. check whether the response is ok to be parsed
    5. if the url corresponds to a product page, then extract data from it
    6. extract more urls from the current page and add them to the queue
    This is repeated continuously until the queue is empty.
    """
    while True:
        logger.info(
            'links: [%d] pending, [%d] discovered, [%d] visited'
            % (queue.qsize(), len(discovered), len(visited))
        )
        url = queue.get()
        logger.info('crawler [%d] took [%s] from queue' % (n, url))
        response = requests.get(url, verify=False)  # no SSL validation
        visited.append(url)
        if response.status_code == requests.codes.ok:
            soup = Soup(response.content)
            if is_valid_product_page(url, response):
                data = extract_product_data(url, soup)
                csv.write(CSV_FORMAT % data)
            discover_links(url, soup)
        else:
            logger.warning('response not ok for [%s]' % url)
        queue.task_done()
def manage_webhook_data(queue):
    while True:
        qsize = queue.qsize()
        if qsize > 5000:
            log.warning("Queue length is at %s... this may be causing "
                        "a significant delay in notifications.", qsize)
        data = queue.get(block=True)
        obj = Events.event_factory(data)
        if obj is not None:
            for name, mgr in managers.iteritems():
                mgr.update(obj)
                log.debug("Distributing event {} to manager {}.".format(
                    obj.id, name))
            log.debug("Finished distributing event: {}".format(obj.id))
def init():
    # queue init
    # main.queue.put("")
    # main.pool.spawn(getLink).join()

    # give worker pool
    print('start crawling')
    # while not pool.free_count() == 15:
    while not queue.empty():
        gevent.sleep(0.8)
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(getData)

    # wait for everything to complete
    pool.join()
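# A self-contained sketch of the gevent pattern used above and in several of the
# other snippets: each tick, spawn at most min(queue.qsize(), pool.free_count())
# workers, so the pool is never overcommitted and no worker blocks on an empty queue.
# The getData body, the pool size, and the URLs below are illustrative.
import gevent
from gevent.pool import Pool
from gevent.queue import Queue

queue = Queue()
pool = Pool(5)

def getData():
    url = queue.get()
    gevent.sleep(0.1)  # stand-in for the real network call
    print('processed', url)

for n in range(20):
    queue.put('http://example.com/%d' % n)

while not queue.empty():
    gevent.sleep(0.1)
    for x in range(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(getData)
pool.join()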
def runloop(self):
    i = self.i
    queue = self.queue
    multiplier = self.options.multiplier
    rate_limit = self.options.rate_limit
    location_grep = self.options.location_grep
    incoming_requests_counter = Counter('input')
    record_file = None
    if self.options.record_file:
        record_file = open(self.options.record_file, 'w')
    if statsd_client:
        def on_tick():
            statsd_client.incr('backlog', queue.qsize())
        incoming_requests_counter.on_tick = on_tick
    drop_counter = Counter('dropped')
    multiplied_output_counter = Counter()
    logger.info('Listener %d started', i)
    len_limit = self.options.backlog - self.options.backlog_breathing_space
    while self.running:
        q = self.next_query()
        if not q:
            continue
        if location_grep and not self.filter_by_location(q, location_grep):
            continue
        if record_file:
            print >> record_file, q
        incoming_requests_counter.count()
        logger.debug('Listener %d got %s', i, q)
        if queue:
            for _ in xrange(multiplier):
                multiplied_output_counter.count()
                if rate_limit > 0 and multiplied_output_counter.v >= rate_limit:
                    continue
                if queue.qsize() > len_limit:
                    drop_counter.count()
                else:
                    queue.put(q)
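# The back-pressure policy in runloop() above (drop new work once queue.qsize()
# passes a backlog limit, rather than blocking the producer) can be shown in
# isolation. The helper name bounded_put and the numbers below are illustrative.
import queue

def bounded_put(q, item, len_limit):
    # If the backlog is already past the limit, drop instead of blocking on put().
    if q.qsize() > len_limit:
        return False  # dropped
    q.put(item)
    return True

q = queue.Queue()
dropped = sum(0 if bounded_put(q, i, len_limit=10) else 1 for i in range(100))
print('queued:', q.qsize(), 'dropped:', dropped)  # queued: 11 dropped: 89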
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)
    func = lambda x: queue.put_nowait((parse_comp, {
        'url': domain + x.xpath('./@href')[0],
        'name': x.xpath('./text()')[0]
    }))
    [func(x) for x in tree.xpath('//div[@class="st-text"]//td/a')
     if x.xpath('./text()') != []]
    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
def init_queue_with_item(queue, item=None):
    # drain out queue
    while queue.qsize() > 0:
        queue.get()
    if item:
        queue.put(item)
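# A hypothetical usage of the helper above, e.g. resetting a shared queue between
# test cases; the item values here are illustrative.
import queue

q = queue.Queue()
q.put('stale-1')
q.put('stale-2')

init_queue_with_item(q, item='fresh')
assert q.qsize() == 1
assert q.get() == 'fresh'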
print "job done" handler.log("job done") print "so far crawled %s pages" % crawled handler.log("so far crawled %s pages" % crawled) queue.put(start_url_1) queue.put(start_url_2) pool.spawn(crawler) handler = Handler() print 'starting Crawler...' handler.log('starting Crawler...') while not queue.empty() and not pool.free_count() == workers_count: gevent.sleep(0.8) for x in xrange(0, min(queue.qsize(), pool.free_count())): pool.spawn(crawler) #wait for jobs to finish pool.join() print "Done" handler.log("Done+\n") print '\n' print "collected %s imgs" % ITEMS_COUNT handler.log("collected %s imgs" % ITEMS_COUNT) print "see generated output and log files" handler.close() #close the IO files
def on_tick():
    statsd_client.incr('backlog', queue.qsize())
    }
    try:
        r = requests.get('http://store.nike.com/cn/zh_cn/',
                         proxies=proxies, timeout=(3, 1))
    except requests.exceptions.ConnectionError:
        return
    except requests.exceptions.ReadTimeout:
        return
    end = time.time()
    delay = '{:.0f}ms'.format((end - start) * 1000)
    queue.put([index, delay])


if __name__ == '__main__':
    with open('give.txt', 'r') as f:
        ips = f.read().strip().split('\n')
    pool = gevent.pool.Pool(len(ips))
    queue = gevent.queue.Queue()
    for index, ip in enumerate(ips):
        pool.apply_async(ip_delay, (index, ip))
    pool.join()
    # ip_delay(00, None)
    nums = []
    while True:
        if queue.qsize() > 0:
            task = queue.get()
            print(task)
            nums.append(task[0])
        else:
            break
    nums.sort()
    print(nums)