def loop_writer(self, transport, queue):
    """
    Write data back asynchronously.
    :param transport:
    :param queue:
    :return:
    """
    msg = queue.get()

    # A None msg means all input messages have been consumed.
    while self.connection_ok and (msg is not None):
        # print "Write Back Msg"
        try:
            transport.flush_frame_buff(msg)
        except:
            print_exception(info_logger)
            self.connection_ok = False
            break

        # Simple handling: stop once no longer alive.
        if not self.alive:
            break
        msg = queue.get()

    if msg is None:
        info_logger.warning(
            "....Worker Connection To LB Failed, LoopRead Stop")
def close(self):
    # Close the queue. There are 2 possibilities:
    # 1. The file buffer is non-empty and there's a greenlet
    #    emptying it. (See the feed greenlet in the put method.)
    #    The greenlet is blocked putting data in the underlying
    #    queue. We can set size to -1, marking us as closed, and
    #    close the file. The greenlet will check size before
    #    trying to read the file again.
    # 2. The file buffer is empty and there's no running greenlet.
    #    We can set the size to -1 and close the file.
    # In either case, we'll empty the underlying queue, both for
    # cleanliness and to unblock a greenlet, if there is one, so
    # it can die a normal death.
    if self.size < 0:
        return  # already closed
    self.size = -1
    self.file.close()
    queue = self.queue
    while queue.qsize():
        queue.get()
    self.size_bytes = 0
def bind_queue(connection, queue):
    """Binds a queue to the bindings identified in the doc."""
    bindings = queue.get('bindings')
    ch = connection.channel(synchronous=True)
    name = queue.get('queue')
    for binding in bindings:
        exchange = binding['exchange']
        key = binding['routing_key']
        logger.info("bind {} to {}:{}".format(name, exchange, key))
        ch.queue.bind(name, exchange, key, nowait=False)
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        print "I am " + name + " executing " + str(GreenletRequests.ng)
        GreenletRequests.ng += 1
        gevent.sleep(0)
def putRequest(queue, payload=None):
    response = {}
    data = {}
    while not queue.empty():
        resourceURI = queue.get(timeout=1)
        response["Node"] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers["Content-Type"] == "application/json":
                data = r.json()
            else:
                data = r.text
            response["StatusCode"] = r.status_code
            response["Data"] = data
        except requests.exceptions.Timeout:
            response["StatusCode"] = 408
            response["Data"] = data
        except requests.exceptions.ConnectionError:
            response["Node"] = resourceURI
            response["StatusCode"] = 404
            response["Data"] = "n/a"
        GreenletRequests.NodeResponsesPost.append(response)
        print "Threaded PUT with ID " + str(GreenletRequests.npo) + " executed for " + resourceURI
        GreenletRequests.npo += 1
        gevent.sleep(0)
def yielding_checked_fnwalk(path, fn, sleep_interval=0.01):
    try:
        parent, name = os.path.split(path)
        entry = scandir.GenericDirEntry(parent, name)
        if fn(entry):
            yield entry

        queue = gevent.queue.LifoQueue()
        if entry.is_dir():
            queue.put(path)

        while True:
            try:
                path = queue.get(timeout=0)
            except gevent.queue.Empty:
                break
            else:
                for entry in scandir.scandir(path):
                    if fn(entry):
                        if entry.is_dir():
                            queue.put(entry.path)
                        yield entry
                gevent.sleep(sleep_interval)
    except Exception as e:
        logging.exception(
            'Exception while directory walking: {}'.format(str(e)))
def batch_fetch(self, queue, event, linger_ms, max_queued_messages):
    if queue.qsize() < max_queued_messages:
        event.wait(linger_ms / 1000)
        if event.is_set():
            event.clear()
    batch_msgs = [queue.get() for _ in range(queue.qsize())]
    return batch_msgs
def listen(self, namespace, max_timeout):
    """Register to listen to a namespace and yield messages as they arrive.

    If no messages arrive within `max_timeout` seconds, this will yield a
    `None` to allow clients to do periodic actions like send PINGs.

    This will run forever and yield items as an iterable. Use it in a loop
    and break out of it when you want to deregister.
    """
    queue = gevent.queue.Queue()
    namespace = namespace.rstrip("/")
    for ns in _walk_namespace_hierarchy(namespace):
        self.consumers.setdefault(ns, []).append(queue)

    try:
        while True:
            # jitter the timeout a bit to ensure we don't herd
            timeout = max_timeout - random.uniform(0, max_timeout / 2)
            try:
                yield queue.get(block=True, timeout=timeout)
            except gevent.queue.Empty:
                yield None

            # ensure we're not starving others by spinning
            gevent.sleep()
    finally:
        for ns in _walk_namespace_hierarchy(namespace):
            self.consumers[ns].remove(queue)
            if not self.consumers[ns]:
                del self.consumers[ns]
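# A hedged usage sketch for the listen() method above. The broker object,
# namespace, and handling logic here are assumptions for illustration only;
# they are not part of the original source.
def print_messages(message_broker):
    # Iterate forever; a None item means max_timeout elapsed with no message.
    for message in message_broker.listen("/events/activity/", max_timeout=30):
        if message is None:
            continue  # idle tick: a natural place to send a PING, for example
        print(message)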
def wait(self, timeout):
    queue = gevent.queue.Channel()
    self.__waiters.append(queue)
    data = queue.get(timeout=timeout)
    if data is False:
        raise TimeoutError("The request timed out.")
    return data
def deleteRequest(queue):
    response = {}
    while not queue.empty():
        resURI = queue.get(timeout=DMON_TIMEOUT)
        try:
            r = requests.delete(resURI, timeout=DMON_TIMEOUT)
            data = r.json()
            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['Data'] = 'n/a'
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['Data'] = 'n/a'
        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded DELETE with ID ' + str(GreenletRequests.nd) + ' executed for ' + resURI
        app.logger.info(
            '[%s] : [INFO] Thread DELETE with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.nd), resURI)
        GreenletRequests.nd += 1
        gevent.sleep(0)
def test_helper_sync_recv_msg(self, queue):
    for tick in range(10):
        msg = queue.get()
        if msg is not None:
            break
        time.sleep(1)
    return msg
def resolve_once(record_type, domains, server_type, servers, timeout, strategy, wrong_answer):
    greenlets = []
    queue = gevent.queue.Queue()
    try:
        for domain in domains:
            for server in servers:
                server_ip, server_port = server
                greenlets.append(gevent.spawn(
                    resolve_one, record_type, domain, server_type,
                    server_ip, server_port, timeout - 0.1, strategy, wrong_answer, queue=queue))
        started_at = time.time()
        domains_answers = {}
        remaining_timeout = started_at + timeout - time.time()
        while remaining_timeout > 0:
            try:
                domain, answers = queue.get(timeout=remaining_timeout)
                domains_answers[domain] = answers
                if len(domains_answers) == len(domains):
                    return domains_answers
            except gevent.queue.Empty:
                return domains_answers
            remaining_timeout = started_at + timeout - time.time()
        return domains_answers
    finally:
        for greenlet in greenlets:
            greenlet.kill(block=False)
def putRequest(queue, payload=None):
    response = {}
    data = {}
    while not queue.empty():
        resourceURI = queue.get(timeout=DMON_TIMEOUT)
        response['Node'] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers['Content-Type'] == 'application/json':
                data = r.json()
            else:
                data = r.text
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['StatusCode'] = 408
            response['Data'] = data
        except requests.exceptions.ConnectionError:
            response['Node'] = resourceURI
            response['StatusCode'] = 404
            response['Data'] = 'n/a'
        GreenletRequests.NodeResponsesPost.append(response)
        # print 'Threaded PUT with ID ' + str(GreenletRequests.npo) + ' executed for ' + resourceURI
        app.logger.info(
            '[%s] : [INFO] Thread PUT with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.npo), resourceURI)
        GreenletRequests.npo += 1
        gevent.sleep(0)
def crawler(n):
    """
    This is the worker routine, the heart of this solution.
    The job is performed by the following steps:
    1. take a url from the queue
    2. make a request to this url
    3. mark it as visited
    4. check whether the response is ok to be parsed
    5. if the url corresponds to a product page, then extract data from it
    6. extract more urls from the current page and add them to the queue
    This is repeated continuously until the queue is empty.
    """
    while True:
        logger.info(
            'links: [%d] pending, [%d] discovered, [%d] visited' %
            (queue.qsize(), len(discovered), len(visited))
        )
        url = queue.get()
        logger.info('crawler [%d] took [%s] from queue' % (n, url))
        response = requests.get(url, verify=False)  # no SSL validation
        visited.append(url)

        if response.status_code == requests.codes.ok:
            soup = Soup(response.content)
            if is_valid_product_page(url, response):
                data = extract_product_data(url, soup)
                csv.write(CSV_FORMAT % data)
            discover_links(url, soup)
        else:
            logger.warning('response not ok for [%s]' % url)

        queue.task_done()
def _download_helper():
    while not queue.empty():
        h = queue.get()
        if not h:
            break

        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
def loop(queue):
    while True:
        item = queue.get()
        try:
            f, args, kwargs = item
            gevent.spawn(f, *args, **kwargs)
        except Exception as e:
            sys.excepthook(*sys.exc_info())
def start_with_server(self, workflow):
    queue = gevent.queue.Queue()
    server = gevent.spawn(SockServer(MonitorServer, self, workflow).run, pipe=queue)
    port = queue.get()
    print "Server started on", port
    self.start(workflow)
    self.join()
    server.kill()
def __deliver(self, queue):
    need_deliver = []
    while 1:
        item = queue.get()
        need_deliver.append(item)
        if len(need_deliver) >= 20:
            gevent.spawn(self.__dodeliver, need_deliver)
            need_deliver = []
def queue_pop(queue):
    while not stopped:
        try:
            return queue.get(timeout=5)
        except gevent.queue.Empty:
            continue
    if stopped:
        raise QueueStopped()
def spawner(queue):
    while 1:
        try:
            item = queue.get()
        except hub.LoopExit:
            logger.error("exit getter spawner...")
            return
        queue.task_done()
        gs.append(gevent.spawn(http_getter, item))
def get_messages(queue):
    while True:
        try:
            message = queue.get(timeout=90)
        except gevent.queue.Empty:
            return
        if message is StopIteration:
            return
        yield message
def response_generator():
    ii = 0
    while npending or not queue.empty():
        ii += 1
        result = queue.get()
        msg = '{} {}\n'.format(ii, result)
        print(msg, end='')
        yield msg
    t2 = datetime.datetime.now()
    print('====', t2 - t1)
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        # print 'I am + ' + name + ' executing ' + str(GreenletRequests.ng)
        app.logger.info(
            '[%s] : [INFO] %s executing %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            name, str(GreenletRequests.ng))
        GreenletRequests.ng += 1
        gevent.sleep(0)
def RegexpMatchWait(queue):
    if queue.empty():
        gevent.sleep(1)
        return ''
    (tweet_dic, match_result) = queue.get()
    if tweet_dic is None or match_result is None:
        return "\n"
    result_dic = tweet_dic.copy()
    result_dic['match_result'] = match_result
    logging.info('waiting tweet text got: %s' % str(result_dic))
    return "%s\n" % json.dumps(result_dic)
def resolve_google_ips(cls, create_tcp_socket):
    if cls.GOOGLE_IPS:
        return True
    LOGGER.info('resolving google ips from %s' % cls.GOOGLE_HOSTS)
    all_ips = set()
    selected_ips = set()
    for host in cls.GOOGLE_HOSTS:
        if re.match(r'\d+\.\d+\.\d+\.\d+', host):
            selected_ips.add(host)
        else:
            ips = resolve_google_ips(host)
            if len(ips) > 1:
                all_ips |= set(ips)
    if not selected_ips and not all_ips:
        LOGGER.fatal('failed to resolve google ip')
        return False
    queue = gevent.queue.Queue()
    greenlets = []
    try:
        for ip in all_ips:
            greenlets.append(gevent.spawn(test_google_ip, queue, create_tcp_socket, ip))
        deadline = time.time() + 5
        for i in range(min(3, len(all_ips))):
            try:
                timeout = deadline - time.time()
                if timeout > 0:
                    selected_ips.add(queue.get(timeout=1))
                else:
                    selected_ips.add(queue.get(block=False))
            except:
                break
        if selected_ips:
            cls.GOOGLE_IPS = selected_ips
            LOGGER.info('found google ip: %s' % cls.GOOGLE_IPS)
        else:
            cls.GOOGLE_IPS = list(all_ips)[:3]
            LOGGER.error('failed to find working google ip, fallback to first 3: %s' % cls.GOOGLE_IPS)
        return True
    finally:
        for greenlet in greenlets:
            greenlet.kill(block=False)
def create_user_worker(dom, queue):
    collects = []
    while True:
        item = queue.get()
        if item is None:
            break
        collects.append(item)
        if len(collects) == 25:
            tasks.put((helper, dom, collects))
            collects = []
    if collects:
        tasks.put((helper, dom, collects))
def greenlet_worker():
    while True:
        try:
            func = queue.get()
            if func is _STOP:
                break
            func()
        except self.queue_empty:
            continue
        except Exception as exc:
            log.warning("Exception in worker greenlet")
            log.exception(exc)
def greenlet_worker():
    while True:
        try:
            func = queue.get()
            if func is _STOP:
                break
            func()
        except Empty:
            continue
        except Exception as exc:
            log.warning("Exception in worker greenlet")
            log.exception(exc)
def create_ssl_connection(client, timeout=None, max_timeout=16, max_retry=4, max_window=4):
    def _create_ssl_connection(address, timeout, queue):
        try:
            # create an ipv4/ipv6 socket object
            sock = client.create_upstream_sock()
            # set reuseaddr option to avoid 10048 socket error
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            # resize socket recv buffer 8K->32K to improve browser-related application performance
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 32 * 1024)
            # disable Nagle's algorithm to send the http request quickly.
            sock.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, True)
            # set a short timeout to trigger timeout retry more quickly.
            sock.settimeout(timeout or max_timeout)
            # create ssl socket
            ssl_sock = ssl.wrap_socket(
                sock, do_handshake_on_connect=False, ssl_version=ssl.PROTOCOL_TLSv1)
            client.add_resource(ssl_sock)
            # start connection time record
            start_time = time.time()
            # TCP connect
            ssl_sock.connect(address)
            connected_time = time.time()
            # SSL handshake
            ssl_sock.do_handshake()
            handshaked_time = time.time()
            # record TCP connection time
            tcp_connection_time[address] = connected_time - start_time
            # record SSL connection time
            ssl_connection_time[address] = handshaked_time - start_time
            # sometimes, we want to use the raw tcp socket directly (select/epoll), so setattr it to the ssl socket.
            ssl_sock.sock = sock
            queue.put(ssl_sock)
        except socket.error as e:
            if LOGGER.isEnabledFor(logging.DEBUG):
                LOGGER.debug('[%s] upstream connection error' % repr(client), exc_info=1)
            # on any socket.error, put the exception on the output queue.
            queue.put(e)
            # reset a large and random timeout for the address
            ssl_connection_time[address] = max_timeout + random.random()

    addresses = [(google_ip, 443) for google_ip in GoAgentProxy.GOOGLE_IPS]
    for i in xrange(max_retry):
        window = min((max_window + 1) // 2 + i, len(addresses))
        addresses.sort(key=ssl_connection_time.get)
        addrs = addresses[:window] + random.sample(addresses, window)
        queue = gevent.queue.Queue()
        for addr in addrs:
            gevent.spawn(_create_ssl_connection, addr, timeout, queue)
        for i in xrange(len(addrs)):
            result = queue.get()
            if not isinstance(result, socket.error):
                return result
    client.fall_back('connect to google ip failed')
def crawler():
    global handler
    global crawled
    global DATA
    global ALREADY_CRAWLED
    global MAX_CRAWLS
    global ITEMS_COUNT

    handler.log("job started...")
    print "job started..."
    while 1:
        try:
            url = queue.get(timeout=1)
            if url in ALREADY_CRAWLED:
                continue
            content = handler.loadPage(url)
            content = content.decode('utf-8')
            doc = lxml.html.fromstring(content)
            imgs = doc.xpath("//div[@class='outletProductImage']/a/img")
            for img in imgs:
                img_url = main_domain + img.attrib['src']
                if img_url:
                    handler.addResults(img_url)
                    ITEMS_COUNT += 1
                    print img_url
                    handler.log(img_url)

            # add the next pages to crawl to the queue
            if crawled < MAX_CRAWLS:
                crawled += 1
                ALREADY_CRAWLED.append(url)
                hrefs = doc.xpath("//div[@class='browsePageControls']/a[@class='control next']")
                for href in hrefs:
                    href = href.attrib['href']
                    if href:
                        next_url = main_domain + href
                        if next_url not in ALREADY_CRAWLED:
                            queue.put(next_url)
                    break  # take only the first nav on the top page
            else:
                raise gevent.queue.Empty
        except gevent.queue.Empty:
            break

    print "job done"
    handler.log("job done")
    print "so far crawled %s pages" % crawled
    handler.log("so far crawled %s pages" % crawled)
def drainQueue():
    while True:
        f, a, kw, resume = queue.get()
        try:
            print("run", f)
            ret = f(*a, **kw)
            print("runnnn", ret)
            resume.set(ret)
        except Exception as e:
            print("boooo")
            import traceback
            traceback.print_exc()
            resume.set_exception(e)
def worker(self, thread_id, queue):  # pylint: disable=unused-argument
    while True:
        try:
            spot_instance_request = queue.get()
            self.process_spot_instance_request(spot_instance_request)
        except Exception:
            self._logger.exception(
                'Exception while processing spot instance request')
        finally:
            queue.task_done()
def create_queue(connection, queue):
    """Creates a queue synchronously."""
    name = queue['queue']
    logger.info("Create queue {}".format(name))
    durable = bool(queue.get('durable', True))
    auto_delete = bool(queue.get('auto_delete', False))
    exclusive = bool(queue.get('exclusive', False))
    passive = False
    nowait = False

    arguments = {}
    queue_args = [
        'x_dead_letter_exchange',
        'x_dead_letter_routing_key',
        'x_max_length',
        'x_expires',
        'x_message_ttl',
    ]
    for queue_arg in queue_args:
        key = queue_arg.replace('_', '-')
        if queue.get(queue_arg):
            arguments[key] = queue.get(queue_arg)

    ch = connection.channel(synchronous=True)
    ret = ch.queue.declare(
        queue=name,
        passive=passive,
        exclusive=exclusive,
        durable=durable,
        auto_delete=auto_delete,
        nowait=nowait,
        arguments=arguments
    )
    name, message_count, consumer_count = ret
    log_message = "Queue {} - presently {} messages and {} consumers connected"
    logger.info(log_message.format(name, message_count, consumer_count))
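# Illustrative sketch of the queue-definition dict that create_queue() above
# (and bind_queue() earlier in this collection) appear to expect. The queue
# name, exchange, and routing key are assumptions, not from the original
# source; underscore keys like 'x_message_ttl' are translated to the
# hyphenated broker arguments by create_queue().
example_queue_doc = {
    'queue': 'orders.incoming',
    'durable': True,
    'auto_delete': False,
    'x_message_ttl': 60000,  # becomes the 'x-message-ttl' argument
    'bindings': [
        {'exchange': 'orders', 'routing_key': 'order.created'},
    ],
}
# create_queue(connection, example_queue_doc)
# bind_queue(connection, example_queue_doc)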
def manage_webhook_data(queue):
    while True:
        qsize = queue.qsize()
        if qsize > 5000:
            log.warning("Queue length is at %s... this may be causing " +
                        "a significant delay in notifications.", qsize)
        data = queue.get(block=True)
        obj = Events.event_factory(data)
        if obj is not None:
            for name, mgr in managers.iteritems():
                mgr.update(obj)
                log.debug("Distributing event {} to manager {}.".format(
                    obj.id, name))
            log.debug("Finished distributing event: {}".format(obj.id))
def greenlet_worker():
    while True:
        try:
            func = queue.get()
            try:
                if func is _STOP:
                    break
                func()
            except Exception as exc:
                log.warning("Exception in worker greenlet")
                log.exception(exc)
            finally:
                del func  # release before possible idle
        except self.queue_empty:
            continue
def loop_writer(self, transport, queue):
    """
    Write data back asynchronously.
    :param transport:
    :param queue:
    :return:
    """
    msg = queue.get()

    # A None msg means all input messages have been consumed.
    while self.connection_ok and (msg is not None):
        # print "Write Back Msg"
        try:
            transport.flush_frame_buff(msg)
        except:
            print_exception(info_logger)
            self.connection_ok = False
            break

        # Simple handling: stop once no longer alive.
        if not self.alive:
            break
        msg = queue.get()

    if msg is None:
        info_logger.warning("....Worker Connection To LB Failed, LoopRead Stop")
def send_message(socket):
    global queue
    while True:
        try:
            if not queue.empty():
                # print("QUEUE NOT EMPTY")
                message = queue.get(block=False)
                if not socket.closed:
                    socket.send(json.dumps(message))
                    # print('Sent response')
            # We need a sleep call so that other greenlets can run
            gevent.sleep()
        except Exception as e:
            print("SEND: %s" % e)
            raise e
def writer_task(queue, state):
    client = boto3.client('logs')
    sort_func = itemgetter(0)

    logger.debug('Started writer task')
    for _ in infinity():
        records = queue.get()
        assert records
        records.sort(key=sort_func)
        for unit_conf, unit_records in itertools.groupby(records, sort_func):
            unit_records = list(unit_records)
            logger.debug('Pushing records for unit "%s/%s": %s',
                         unit_conf.name, unit_conf.unit, unit_records)
            push_records(client, unit_records, unit_conf, state)
            metrics.n_logs_sent += len(unit_records)
def gmap_unordered(func, iterable):
    """As per gmap(), but always lazy and yields (arg, result) in order of completion."""
    iterable = list(iterable)
    queue = gevent.queue.Queue(len(iterable))

    def gen_callback(arg):
        def callback(g):
            queue.put((arg, g))
        return callback

    for arg in iterable:
        g = gevent.spawn(func, arg)
        g.link(gen_callback(arg))

    seen = 0
    while seen < len(iterable):
        arg, g = queue.get()
        seen += 1
        yield arg, g.get()
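# A minimal usage sketch for gmap_unordered() above. The workload function is
# an assumption for illustration; it only simulates uneven work so that
# results complete out of order.
import gevent


def _square_slowly(n):
    gevent.sleep(0.01 * n)  # uneven delays so completion order differs from input order
    return n * n


def demo_gmap_unordered():
    # Yields (arg, result) pairs as each greenlet finishes.
    for arg, result in gmap_unordered(_square_slowly, range(5)):
        print(arg, result)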
def crawler():
    global crawled
    while 1:
        try:
            u = queue.get(timeout=0)
            response = requests.get(u)
            print response.status_code, u
            for link in re.findall('<a href="(http.*?)"', response.content):
                if crawled < 10:
                    crawled += 1
                    queue.put(link)
        except gevent.queue.Empty:
            break
def crawler():
    global crawled
    while 1:
        try:
            u = queue.get(timeout=1)
            response = requests.get(u)
            print response.status_code, u
            for link in re.findall('<a href="(http.*?)"', response.content):
                if crawled < 10:
                    crawled += 1
                    queue.put(link)
        except gevent.queue.Empty:
            break
def _download_helper():
    t = time.time()
    while not queue.empty():
        h = queue.get()
        if not h:
            break

        if h == "wait":
            time.sleep(max(0, 60 - time.time() + t))
            t = time.time()
            continue

        if os.path.exists(h):
            print "skipping..", h
            continue

        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
def group_reduce(func, queue, timeout, group_size=2):
    """Chunk the queue into groups of group_size, and map func onto the sequence of chunks.

    queue.get(timeout=timeout) returns data of the form (ID, args) if the queue
    returns data quickly enough; otherwise the function quits.
    Because this func consumes the queue, it decrements mutable_qsize.
    If timeout is 0, this function will block until the queue is not empty."""
    while True:
        group = []
        try:
            for _ in range(group_size):
                group.append(queue.get(timeout=timeout))
        except gevent.queue.Empty:
            [queue.put(elem) for elem in group]
            break
        ids = tuple(x[0] for x in group)
        rv = func(*(x[1] for x in group))
        yield (ids, rv)
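# A hedged usage sketch for group_reduce() above, assuming queue items are
# (ID, args) pairs as its docstring describes. The queue contents, timeout,
# and reducing function are illustrative only.
import operator

import gevent.queue


def demo_group_reduce():
    work = gevent.queue.Queue()
    for i, value in enumerate([3, 5, 7, 9]):
        work.put((i, value))  # (ID, args) pairs as group_reduce expects

    # Pairs items two at a time and sums them, yielding ((id1, id2), total);
    # stops once the queue stays empty past the timeout.
    for ids, total in group_reduce(operator.add, work, timeout=0.1, group_size=2):
        print(ids, total)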
def converter(queue):
    LOGGER.debug('converter started')
    while True:
        data = queue.get()
        LOGGER.debug('new data for conversion')
        if data == StopIteration:
            queue.task_done()
            break
        LOGGER.debug('flv file: %s' % path.abspath(data['source_file'].name))
        LOGGER.debug('target file: %s' % data['target_file'])
        ffmpeg_args = [
            'ffmpeg',
            '-i', path.abspath(data['source_file'].name),
            '-vn',
            '-acodec', data['acodec'],
            '-aq', data['quality'],
            '-y', data['target_file']
        ]
        p = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        fcntl.fcntl(p.stdin, fcntl.F_SETFL, O_NONBLOCK)
        fcntl.fcntl(p.stdout, fcntl.F_SETFL, O_NONBLOCK)
        p.stdin.close()
        output = ""
        while True:
            try:
                chunk = p.stdout.read(4096)
                if not chunk:
                    break
                output += chunk
            except IOError:
                ex = sys.exc_info()[1]
                if ex[0] != errno.EAGAIN:
                    raise
                sys.exc_clear()
                socket.wait_read(p.stdout.fileno())
        p.stdout.close()
        data['source_file'].close()
        LOGGER.debug('conversion done')
        queue.task_done()
def test_upstreams(self):
    LOGGER.error('!!! test upstreams: %s' % self.upstreams)
    greenlets = []
    queue = gevent.queue.Queue()
    good_upstreams = []
    try:
        for server in self.upstreams:
            server_type, server_ip, server_port = server
            greenlets.append(gevent.spawn(
                resolve_one, dpkt.dns.DNS_A, 'onetwothreefour.fqrouter.com',
                server_type, server_ip, server_port, 3, 'pick-right', queue))
        while True:
            try:
                server, answers = queue.get(timeout=2)
                if isinstance(answers, NoSuchDomain):
                    LOGGER.error('%s test failed: no such domain' % str(server))
                    continue
                if len(answers) == 0:
                    LOGGER.error('%s test failed: 0 answer' % str(server))
                    continue
                if len(answers) > 1:
                    LOGGER.error('%s test failed: more than 1 answer' % str(server))
                    continue
                if '1.2.3.4' != answers[0]:
                    LOGGER.error('%s test failed: wrong answer' % str(server))
                    continue
                LOGGER.info('%s is good' % str(server))
                good_upstreams.append(server)
                if len(good_upstreams) > 5:
                    self.upstreams = good_upstreams
                    return
            except gevent.queue.Empty:
                return
    finally:
        for greenlet in greenlets:
            greenlet.kill(block=False)
        if not good_upstreams:
            LOGGER.info('!!! no good upstream !!!')
            sys.exit(1)
def resolve_once(record_type, domain, servers, timeout, strategy):
    greenlets = []
    queue = gevent.queue.Queue()
    try:
        for server in servers:
            server_type, server_ip, server_port = server
            greenlets.append(gevent.spawn(
                resolve_one, record_type, domain, server_type,
                server_ip, server_port, timeout, strategy, queue))
        try:
            server, answers = queue.get(timeout=timeout)
            if isinstance(answers, NoSuchDomain):
                raise answers
            return server, answers
        except gevent.queue.Empty:
            raise ResolveFailure()
    finally:
        for greenlet in greenlets:
            greenlet.kill(block=False)
def crawler():
    '''A very simple queued gevent web crawler'''
    global crawled
    while 1:
        try:
            u = queue.get(timeout=1)
            response = requests.get(u)
            print(response.status_code)

            # Extract some links to follow
            for link in re.findall('<a href="(http.*?)"', response.content):
                # Limit to 10 pages (ignores links when the pool is already full)
                if crawled < 10:
                    crawled += 1
                    queue.put(link)
        except gevent.queue.Empty:
            break
def getrequestFile(queue, output):
    response = {}
    while not queue.empty():
        resURI = queue.get(timeout=1)
        app.logger.info(
            '[%s] : [INFO] Thread File GET with ID %s starts execution for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.ng), resURI)
        hostURL = urlparse(resURI)
        hostID = hostURL.hostname
        logName = 'worker-%s.tar' % hostID
        logDump = os.path.join(output, logName)
        try:
            r = requests.get(resURI, timeout=DMON_TIMEOUT, stream=True)
            if r.status_code == 200:
                with open(logDump, 'wb') as out_file:  # TODO: investigate chunked writer
                    shutil.copyfileobj(r.raw, out_file)
            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['LogName'] = logDump
            response['Headers'] = r.headers
            del r
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['LogName'] = logDump
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['LogName'] = logDump
        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded GET with ID ' + str(GreenletRequests.ng) + ' executed for ' + resURI
        app.logger.info(
            '[%s] : [INFO] Thread File GET with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.ng), resURI)
        GreenletRequests.ng += 1
        gevent.sleep(0)
def writer(queue):
    conn = psycopg2.connect(
        database='warehouse',
        port='5433',
        user='******',
        password='******',
        host='127.0.0.1',
    )
    cnt = 0
    qw = []
    fformat = qu.format
    with conn.cursor() as cur:
        while True:
            value = queue.get()
            cnt += 1
            qw.append(fformat(value))
            if cnt >= 5000:
                cur.execute(trans.format("\n".join(qw)))
                qw = []
                cnt = 0
def crawler():
    while 1:
        try:
            url = queue.get(timeout=0)
            print('Fetching ... %s' % url)
            response = requests.get(url)
            if response.status_code == requests.codes.ok:
                # Extract some links to follow
                for link in re.findall(URLREGEX, response.text):
                    if link not in tranvered:
                        tranvered[link] = True
                        queue.put(getUrl(link))
            else:
                print('\x1b[0;30;41mFAILED\x1b[0m with %d ... %s' %
                      (response.status_code, url))
        except gevent.queue.Empty:
            print('queue empty')
            break