class ProgressStream(Future): def __init__(self): super().__init__() self._progress = Queue() self._progress_task = create_task(self._progress.get()) self._complete_task = create_task(self) def __aiter__(self): return self async def __anext__(self): if self.done(): raise StopAsyncIteration done, pending = await wait({ self._complete_task, self._progress_task }, return_when=FIRST_COMPLETED) if self._complete_task in done: self._progress_task.cancel() self._complete_task.result() # throws an exception (when necessary) raise StopAsyncIteration else: progress = self._progress_task.result() self._progress_task = create_task(self._progress.get()) return progress def write(self, item): if self.done(): raise InvalidStateError if self._progress.qsize() > 0: self._progress.get_nowait() self._progress.put_nowait(item)
async def run(self, queue_in: asyncio.Queue, queue_out: asyncio.Queue):
    while True:
        print(" ... running env update")
        try:
            color = await asyncio.wait_for(queue_out.get(), timeout=1.0)
            dir_move = await asyncio.wait_for(queue_out.get(), timeout=1.0)
        except asyncio.TimeoutError:
            print("Timeout!")
            print(f"Painted {len(self.painted_panel)} panels!")
            break
        self.update([color, dir_move])
        next_input = self.get_color()
        await queue_in.put(next_input)
async def schedule(chan: Queue, gen: Callable[..., Awaitable[T]]) -> AsyncIterator[T]: it, curr = count(), -1 prev: Task = create_task(sleep(inf)) while True: done, pending = await wait((chan.get(), prev), return_when=FIRST_COMPLETED) for p in pending: p.cancel() for d in await gather(*done): if type(d) is Signal: sig = cast(Signal, d) curr = i = next(it) async def d_gen(*args: Any, **kwargs: Any) -> Tuple[int, T]: ret = await gen(*args, **kwargs) return i, ret prev = create_task(d_gen(*sig.args, **sig.kwargs)) else: prev = create_task(sleep(inf)) c, ret = d if c == curr: yield ret
class CyclicQueuePool:
    """Cyclic queue pool of connections."""
    def __init__(self, connector, pool_size, connection_cls):
        self.pool_size = pool_size
        self.pool = Queue(pool_size)
        for _ in range(pool_size):
            self.pool.put_nowait(connection_cls(connector))

    async def acquire(self, _urlparsed: ParseResult = None):
        """Acquire connection."""
        return await self.pool.get()

    async def release(self, conn):
        """Release connection."""
        return self.pool.put_nowait(conn)

    def is_all_free(self):
        """Indicate whether the whole pool is free."""
        return self.pool_size == self.pool.qsize()

    def free_conns(self) -> int:
        return self.pool.qsize()

    async def cleanup(self):
        """Get all connections and close them; this leaves the pool unusable."""
        for _ in range(self.pool_size):
            conn = await self.pool.get()  # get() is a coroutine and must be awaited
            conn.close()
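# A minimal usage sketch for CyclicQueuePool above; DummyConnection is a
# hypothetical stand-in (not part of the original snippet) whose instances
# only need a close() method. Run with asyncio.run(pool_demo()).
class DummyConnection:
    def __init__(self, connector):
        self.connector = connector

    def close(self):
        pass


async def pool_demo():
    pool = CyclicQueuePool(connector=None, pool_size=2, connection_cls=DummyConnection)
    conn = await pool.acquire()    # take a connection out of the queue
    try:
        pass                       # use the connection here
    finally:
        await pool.release(conn)   # return it so other tasks can acquire it
    await pool.cleanup()           # closes every connection; the pool is unusable afterwards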
class HTTPConcurrentPoll(HTTPPoll):
    """Polls each address concurrently in its own Task."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._queue = Queue()

    async def _poll_address(self, address: str, header=None):
        while True:
            data = await self._read_address(address, header)
            await self._queue.put(data)
            await asyncio.sleep(self.sleep)

    async def read(self, header=None) -> AsyncIterable[str]:
        tasks = asyncio.gather(*(self._poll_address(address, header)
                                 for address in self.address))
        try:
            while not tasks.done():
                with suppress(asyncio.exceptions.TimeoutError):
                    yield await asyncio.wait_for(self._queue.get(), timeout=1)
        finally:
            if not tasks.done():
                tasks.cancel()
                with suppress(CancelledError):
                    await tasks
            elif tasks.exception() is not None:
                raise tasks.exception()
async def consume(queue: asyncio.Queue):
    # do things
    # ...
    while True:
        msg = await queue.get()  # get() is a coroutine and must be awaited
        if msg is None:
            break
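# A minimal producer sketch to pair with consume() above, assuming the
# convention shown there that None is the shutdown sentinel; run it with
# asyncio.run(produce_consume_demo()).
import asyncio


async def produce(queue: asyncio.Queue, items):
    for item in items:
        await queue.put(item)
    await queue.put(None)  # tell consume() to stop


async def produce_consume_demo():
    queue = asyncio.Queue()
    await asyncio.gather(produce(queue, range(3)), consume(queue))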
async def __task_trade_receiver(self, name: str, qtrade: asyncio.Queue, qcandle: asyncio.Queue) -> None:
    # Process trades
    while not self.__estop.is_set():
        # Wait for a trade
        flag, trade = await wait_first([qtrade.get(), self.__estop.wait()])
        if not flag:
            break
        # Update the candle
        key = f"{name}:{trade.symbol.name}"
        async with self.__lock:
            candle = self.__candles.get(key)
            if not candle:
                candle = self.__candles[key] = Candle(trade.symbol, self.period)
                self.__tasks.append(asyncio.create_task(candle.start(qcandle)))
            await candle.update(trade)
        # Collect statistics
        if self.__perfomance:
            await self.__perfomance.add("trades")
        # Show the trade
        if self.show_trades:
            self.logger.info(f"Trade <{name}>: {trade}")
class ModelThreadPool:
    def __init__(self):
        self.bs_in_queue = Queue()
        self.bs_out_queue = Queue()
        thread = threading.Thread(target=self.bubble_sorter, args=())
        thread.daemon = True
        thread.start()
        # Repeat the process for the rest of the threads

    def input_converter(self):
        pass

    def bubble_sorter(self):
        fn = lambda x: 1
        while True:
            if self.bs_in_queue.empty():  # was self.bs_queue, which is never defined
                continue
            num_wires = self.bs_in_queue.get()
            self.bs_out_queue.put(fn(num_wires))

    def query_bubble_sorter(self, num_wires):
        self.bs_in_queue.put(num_wires)
        sleep(2)
        return self.bs_out_queue.get()
async def _(cid: int, quality: Optional[int] = 4):
    waiter = Queue()
    await jobs.put((cid, quality, waiter))
    try:
        return await wait_for(waiter.get(), 10)
    except TimeoutError:
        raise HTTPException(status.HTTP_503_SERVICE_UNAVAILABLE, 'No available workers!')
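# A minimal worker-side sketch matching the handler above; the shape of the
# jobs queue entries (cid, quality, waiter) is taken from that handler, while
# the "result" computed here is a hypothetical placeholder.
import asyncio


async def worker(jobs: asyncio.Queue):
    while True:
        cid, quality, waiter = await jobs.get()
        result = f"rendered {cid} at quality {quality}"  # placeholder for the real work
        await waiter.put(result)   # wakes the request handler awaiting waiter.get()
        jobs.task_done()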
class Gateway(MqttBase): __slots__ = ['gwid', '__logger', '__last_uptime', '__reboot_queue'] def __on_heartbeat(self, client, userdata, msg): self.__logger.debug("saw gateway heartbeat") heartbeat_json = json.loads(msg.payload) uptime = heartbeat_json['sysinfo']['uptime'] if uptime < self.__last_uptime: self.__logger.debug('gateway uptime went backwards, probably rebooted') self.event_loop.call_soon_threadsafe(self.__reboot_queue.put_nowait, True) self.__last_uptime = uptime def __init__(self, host: str, gwid: str): self.__last_uptime = -1 self.__reboot_queue = Queue() self.gwid = gwid heartbeat_topic = '%s/%s/%s' % (TOPICROOT, gwid, HEARTBEAT) super().__init__(host, id=mqttbase.create_client_id('gwctrl'), topics=[heartbeat_topic]) self.__logger = logging.getLogger('gwctrl') self.mqtt_client.message_callback_add(heartbeat_topic, self.__on_heartbeat) async def reboot(self): self.__logger.debug('triggering gateway reboot') self.mqtt_client.publish('%s/%s/ctrl/%s' % (TOPICROOT, self.gwid, CTRL_REBOOT)) return await asyncio.wait_for(self.__reboot_queue.get(), 5 * 60)
class Subscription(object): def __init__(self): self._count = 0 self._queue = Queue() async def __aenter__(self): self._count += 1 return self async def __aexit__(self, ext_type, exc, tb): self._count -= 1 def __aiter__(self): return self async def __anext__(self): try: result = await wait_for(self._queue.get(), 5) except TimeoutError: return None else: return result async def notify(self, value): if self._count > 0: await self._queue.put(value)
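# A minimal usage sketch for Subscription above: the async context manager
# counts active listeners, notify() only enqueues while a listener is active,
# and iteration yields published values (or None once a get() times out after 5s).
import asyncio


async def subscription_demo():
    sub = Subscription()
    async with sub:
        await sub.notify("hello")   # delivered, since the listener count is > 0
        async for value in sub:
            if value is None:       # __anext__ returns None on timeout instead of stopping
                break
            print(value)            # -> "hello"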
async def send_task( s_sender: asyncio.StreamWriter, q: asyncio.Queue, e: asyncio.Event, delimiter: bytes, timeout=None, ): print("[SEND][INFO] Started") try: while True: try: n = await asyncio.wait_for(q.get(), timeout) q.task_done() except asyncio.TimeoutError: if e.is_set(): print(SharedData.bold("[SEND][INFO] Event set!")) return else: try: await tcp_send(n, s_sender, delimiter, timeout) except asyncio.TimeoutError: # really just want to use logging and dump logs in other thread.. print(SharedData.red("[Send][CRIT] Connection Broken!")) break except Exception: print(SharedData.bold("[SEND][CRIT] Stopping SEND!")) e.set() raise
async def run(self, connection: OptolinkConnection, command_queue: asyncio.Queue): connection.flush() while True: # poll start bytes (0x05) and discard them byte = await connection.read() if byte[0] != 0x05: # we are not in synchronization phase and received a byte other than # the synchronization byte -> just wait for the next byte continue # when there is at least one command waiting in the queue, start the communication if not command_queue.empty(): connection.write(b"\x01") # TODO: start measuring utilization here try: while True: cmd, fut = await asyncio.wait_for(command_queue.get(), timeout=0.5) connection.write(cmd.get_command_bytes()) val = await connection.read( cmd.get_expected_bytes_count()) if all(it == 0x05 for it in val): fut.set_exception(Exception("Command failed")) # we must synchronize again break else: if fut.done(): print(fut.result()) raise Exception("Future was already done") fut.set_result(cmd.handle_result(val)) except asyncio.TimeoutError: continue finally: pass
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)
        # Put (URL, max_redirect) in the queue.
        # Queue.put is a coroutine, so use put_nowait from synchronous code.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]
        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            # Call task_done() only after the new links have been added to self.q.
            self.q.task_done()
def breadthfirst(bt):
    """breadthfirst: binary tree -> list[Node]
    Purpose: Runs a breadth-first search on a binary tree.
    Consumes: a binary tree object
    Produces: a list of Nodes in breadth-first search order
    Example:
                      A
        breadthfirst( / \ ) -> [A B C]
                     B   C
    If the tree is empty, returns an empty list.
    If the tree is None, raises InvalidInputException.
    """
    if bt is None:
        raise InvalidInputException("Input is None")
    if bt.isEmpty():
        return []
    Q = Queue()
    qlist = []
    qlist.append(bt.root())
    Q.put(bt.root())
    while not Q.empty():
        node = Q.get()
        if bt.hasLeft(node):
            Q.put(bt.left(node))
            qlist.append(bt.left(node))
        if bt.hasRight(node):
            Q.put(bt.right(node))
            qlist.append(bt.right(node))
    return qlist
class NonBlockingStreamReader: ''' Purpose: Class for handling STDOUT stream without blocking the entire thread. ''' def __init__(self, stream): ''' stream: the stream to read from. Usually a process' stdout or stderr. ''' self._s = stream self._q = Queue() def _populateQueue(stream, queue): ''' Collect lines from 'stream' and put them in queue. ''' while True: line = stream.readline() if line: queue.put(line) else: print("queue ended") break #raise UnexpectedEndOfStream self._t = Thread(target=_populateQueue, args=(self._s, self._q)) self._t.daemon = True self._t.start() #start collecting lines from the stream def readline(self, timeout=None): try: return self._q.get(block=timeout is not None, timeout=timeout) except: #Empty: return None
async def event_runner(self, exit_signal_status, queue: asyncio.Queue):
    while not exit_signal_status():
        event_context: InternalEvent
        try:
            event_context: T.NamedTuple[InternalEvent] = await asyncio.wait_for(queue.get(), 3)
        except asyncio.TimeoutError:
            if exit_signal_status():
                break
            else:
                continue
        if event_context.name in self.registeredEventNames:
            for event in list(self.event.values())[self.registeredEventNames.index(event_context.name)]:
                if event:  # skip empty [] / {}
                    for pre_condition, run_body in event.items():
                        try:
                            condition_result = (not pre_condition) or (pre_condition(event_context.body))
                        except Exception as e:
                            self.throw_exception_event(event_context, queue, e)
                            continue
                        if condition_result:
                            EventLogger.info(f"handling an event: {event_context.name}")
                            asyncio.create_task(self.main_entrance(run_body, event_context, queue))
class Sender(): def __init__(self, name: str): self.name = name self.frames = None self.queue = Queue() async def send(self, frames: list, channel: 'Channel'): await asyncio.sleep(1) self.frames = frames await channel.connect(self) print(self.name, " :\tFrames received form network layer!") for i in range(0, len(frames)): copy = self.frames[i] # print(copy) if self.queue.empty(): await self.queue.put(copy) print(self.name, " :\tFrame sent in the channel!") try: await asyncio.wait_for(channel.transmit(self), timeout=10) val = await asyncio.wait_for(self.queue.get(), timeout=10) print(self.name, " :\thas received", val, "from", val.source_address) except asyncio.TimeoutError as e: print(self.name, " :\tTimed Out!") await asyncio.sleep(3)
class Crawler: def __init__(self, root_url, max_redirect): self.max_tasks = 10 self.max_redirect = max_redirect self.q = Queue() self.seen_urls = set() # aiohttp's ClientSession does connection pooling and # HTTP keep-alives for us. self.session = aiohttp.ClientSession(loop=loop) # Put (URL, max_redirect) in the Queue self.q.put((root_url, self.max_redirect)) @asyncio.coroutine def crawl(self): '''Run the crawler untill all work is done.''' workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)] # When all work is done, exit. yield from self.q.join() for w in workers: w.cancel() @asyncio.coroutine def work(self): while True: url, max_redirect = yield from self.q.get() # Download page and add new links to self.q yield from self.fetch(url, max_redirect) self.q.task_done() @asyncio.coroutine def fetch(self, url, max_redirect): # Handle redirects ourselves. response = yield from self.session.get(url, allow_redirects=False) try: if is_redirect(response): if max_redirect > 0: next_url = response.headers['location'] if next_url in self.seen_urls: # We have done this before. return # Remember we have seen this url. self.seen_urls.add(next_url) # Follow the redirect. One less redirect remains. self.q.put_nowait((next_url, max_redirect - 1)) else: links = yield from self.parse_links(response) # Python set-logic: for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: # Return connection to pool. yield from response.release()
class Receiver(): def __init__(self, name): self.name = name self.queue = Queue() self.buffer = [] async def conn(self, channel: 'Channel'): await asyncio.sleep(1) await channel.connect(self) async def recv(self, channel: 'Channel'): val = await asyncio.wait_for(self.queue.get(), timeout=5) if not self.buffer: self.buffer.append(val) print(self.name, " :\thas received frame with seq no.", val.payload['seq_no'], "from", val.source_address) elif self.buffer[-1].payload['seq_no'] == val.payload['seq_no']: print(self.name, " :\thas already received this frame, hence discarded.") else: print(self.name, " :\thas received frame with seq no.", val.payload['seq_no'], "from", val.source_address) self.buffer.append(val) ack = Frame() if val.payload['seq_no'] == '1': ack.setData("0") else: ack.setData('1') ack.source_address = self.name ack.destination_address = val.source_address await self.queue.put(ack)
class Waiter(BaseHandler): """ The Waiter handler allows an event handler to block until a particular stanza has been received. The handler will either be given the matched stanza, or ``False`` if the waiter has timed out. :param string name: The name of the handler. :param matcher: A :class:`~slixmpp.xmlstream.matcher.base.MatcherBase` derived object for matching stanza objects. :param stream: The :class:`~slixmpp.xmlstream.xmlstream.XMLStream` instance this handler should monitor. """ def __init__(self, name, matcher, stream=None): BaseHandler.__init__(self, name, matcher, stream=stream) self._payload = Queue() def prerun(self, payload): """Store the matched stanza when received during processing. :param payload: The matched :class:`~slixmpp.xmlstream.stanzabase.ElementBase` object. """ self._payload.put_nowait(payload) def run(self, payload): """Do not process this handler during the main event loop.""" pass @asyncio.coroutine def wait(self, timeout=None): """Block an event handler while waiting for a stanza to arrive. Be aware that this will impact performance if called from a non-threaded event handler. Will return either the received stanza, or ``False`` if the waiter timed out. :param int timeout: The number of seconds to wait for the stanza to arrive. Defaults to the the stream's :class:`~slixmpp.xmlstream.xmlstream.XMLStream.response_timeout` value. """ if timeout is None: timeout = slixmpp.xmlstream.RESPONSE_TIMEOUT stanza = None try: stanza = yield from self._payload.get() except TimeoutError: log.warning("Timed out waiting for %s", self.name) self.stream().remove_handler(self.name) return stanza def check_delete(self): """Always remove waiters after use.""" return True
async def worker( name: str, to_run: asyncio.Queue, to_write: asyncio.Queue, schema: quiz.Schema, executor: quiz.execution.async_executor, shutdown: asyncio.Event, request_pending: asyncio.Event, run_graphql_with_backoff: Callable[ [quiz.execution.async_executor, str, str], quiz.execution.RawResult ], ): """worker runs Github metadata requests until shutdown More specifically until the shutdown event fires it repeatedly: 1. pulls a request from the to_run queue 2. sets request pending 3. runs the request 4. clears request pending 5. pushes successful request response exhcanges to the to_write queue """ queue_wait_timeout_seconds = 2 while True: if shutdown.is_set(): log.debug(f"{name} shutting down") break try: request: Request = await asyncio.wait_for( to_run.get(), queue_wait_timeout_seconds ) except asyncio.TimeoutError: log.debug(f"{name} no new requests after {queue_wait_timeout_seconds}s") continue with event_in_progress(request_pending): # TODO: retry if request fails due to rate limit or intermittant error try: gql_query = str(schema.query[request.graphql]) assert str(MISSING) not in gql_query log.info(f"{name} running {request.log_str}") log.debug(f"{name} {request.log_id} gql_query is: {gql_query}") result: quiz.execution.RawResult = await run_graphql_with_backoff( executor, name, gql_query ) response: Response = Response(resource=request.resource, json=result) log.debug( f"{name} for {request.log_id} queued response {response.log_str} to write" ) # write non-empty responses to stdout assert response to_write.put_nowait(RequestResponseExchange(request, response)) except Exception as err: log.error(f"{name} error running {request.log_id}\n:{exc_to_str()}") # Notify the queue that the "work item" has been processed. to_run.task_done()
async def queue_batching_fulfiller( batch_processor: ty.Callable[[list], list], queue: asyncio.Queue, future_to_request_map: ty.Dict[asyncio.Future, ty.Any], batch_wait_s: float = _DEFAULT_BATCHING_WINDOW_SECONDS, max_batch_size: int = _DEFAULT_MAX_BATCH_SIZE, logging_name: str = "", ): """This is the long-running async batching loop that takes Future requests over a Queue, batches them together, and sends them to a batch processor. """ if not logging_name: logging_name = str(batch_processor) try: logger.debug( f"Entering new batching loop for {logging_name} with key attributes " f"batching window {batch_wait_s} seconds" ) while True: # wait for at least one request logger.debug(f"Indefinitely awaiting the next queued request for {logging_name}") request_futures = [await queue.get()] logger.debug(f"Received a queued request for {logging_name}") start_time = default_timer() # wait for as many requests as we can, up to the batch request limit, within a small batching window try: while len(request_futures) < max_batch_size: time_left = batch_wait_s - (default_timer() - start_time) logger.debug( f"Waiting for queue for {time_left} seconds in order to process a larger batch." ) request_futures.append(await asyncio.wait_for(queue.get(), time_left)) except asyncio.TimeoutError: pass logger.debug(f"Received {len(request_futures)} requests before closing the batch.") logger.debug("Starting batch processor!") results = batch_processor([future_to_request_map.pop(fut) for fut in request_futures]) logger.debug("Finished batch processor!") # we should always have received a result for every request we made... assert len(results) == len(request_futures) for i in range(len(request_futures)): fut = request_futures[i] result = results[i] fut.set_result(result) logger.debug(f"Finshed setting all {len(request_futures)} future results") except asyncio.CancelledError: pass except Exception as e: traceback.print_exc() raise e # this is for easier debugging inside async tasks finally: logger.info(f"Exiting the Dynamo BatchGet task for {logging_name}")
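# A minimal caller-side sketch for queue_batching_fulfiller above (assumed
# usage, not part of the original): register a Future in the shared map,
# enqueue it, and await the result the batching loop will set on it.
import asyncio
import typing as ty


async def fulfill_one(queue: asyncio.Queue,
                      future_to_request_map: ty.Dict[asyncio.Future, ty.Any],
                      request: ty.Any) -> ty.Any:
    fut = asyncio.get_running_loop().create_future()
    future_to_request_map[fut] = request   # the fulfiller pops the request from this map
    await queue.put(fut)                   # hand the Future to the batching loop
    return await fut                       # resolved once batch_processor returns


async def batching_demo():
    queue: asyncio.Queue = asyncio.Queue()
    requests: ty.Dict[asyncio.Future, ty.Any] = {}
    fulfiller = asyncio.create_task(
        # echo batch processor: one result per request, in order
        queue_batching_fulfiller(lambda batch: list(batch), queue, requests, batch_wait_s=0.1)
    )
    result = await fulfill_one(queue, requests, "ping")
    fulfiller.cancel()
    return result  # "ping"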
async def test_src_to_bot(patched_bot, bot_alert_queue: Queue, source_data):
    source_cls, (_, message) = source_data
    message = json.dumps(message)
    await patched_bot.alert(message)
    try:
        await asyncio.wait_for(bot_alert_queue.get(), 1.5)
        bot_alert_queue.task_done()
    except asyncio.TimeoutError:
        raise AssertionError
class MessageHandler(ws.WS):
    def __init__(self):
        self.queue = Queue()

    def get(self):
        return self.queue.get()

    def on_message(self, websocket, message):
        return self.queue.put(message)
class Dispatcher: def __init__(self): self.bindings = [] self.queue = Queue() def bind(self, action, filt=Filter()): # The default filter will always return True self.bindings.append((action, filt)) def unbind(self, action): for action, filt in list(self.bindings): self.bindings.remove((action, filt)) return True return False def dispatch(self, event, time=10): for action in (a for a, f in self.bindings if f.check(event)): LOG.debug("Dispatching {}".format(str(action))) try: self.queue.put_nowait( (functools.partial(action, event), timeout(time))) except QueueFull: LOG.error("The unbounded queue is full! Pretty weird, eh?") def dispatch_sync(self, event, time=10): loop = get_event_loop() for target in (a for a, f in self.bindings if f.check(event)): LOG.debug("Dispatching {} synchronously".format(str(target))) try: if iscoroutinefunction(target): # TODO: This doesn't really work how we want... loop.call_soon(target) else: target(event) except: LOG.exception( "Error while running {} in synchronous dispatch:".format( target)) @coroutine def run(self): while True: func, tout = yield from self.queue.get() try: if not hasattr(func, "__name__"): setattr(func, "__name__", "<unknown>") with tout: res = yield from coroutine(func)() while iscoroutine(res): with tout: res = yield from res except: LOG.exception( "Error while running {} from dispatch queue:".format(func))
async def drain_queue_async(
        cls, q: asyncio.Queue, timeout: Optional[int] = None) -> Tuple[bool, list]:
    try:
        elem = await asyncio.wait_for(q.get(), timeout=timeout) if timeout else await q.get()
        return cls.__handle_queue_update(q, elem)
    except TimeoutError:
        return False, []
class Listener:
    def __init__(self):
        self._messages = Queue()

    def __call__(self, channel, message):
        self._messages.put_nowait((channel, message))

    def get(self):
        return self._messages.get()
class Message(ws.WS): def __init__(self, loop): self.queue = Queue(loop=loop) def get(self): return self.queue.get() def on_message(self, websocket, message): self.queue.put_nowait(message)
class VoteController(object): def __init__(self, app, workers=5, timeout=2, max_size=1000): self.queue = Queue() self.workers_size = workers self.workers = [] self.timeout = timeout self.max_size = max_size app.on_startup.append(self.start) app.on_cleanup.append(self.stop) def start(self, app): for i in range(self.workers_size): self.workers.append(app.loop.create_task(self._consumer(i))) def stop(self, app): for worker in self.workers: worker.cancel() async def insert(self, obj): await self.queue.put(obj) async def _consumer(self, num=0): reset = lambda: (0, {}, time()) await sleep(0.3 * num) # So workes do not persist at the same time size, data, t0 = reset() print("Hi! I'm worker {}. And I'm ready to ROCK!".format(num)) try: while True: try: obj = await wait_for(self.queue.get(), self.timeout) data = self._process_data(data, obj) size += 1 except TimeoutError: pass if time() - t0 > self.timeout or size >= self.max_size: if data: await db.update(data) size, data, t0 = reset() except CancelledError: pass def _process_data(self, data, obj): poll_id = obj['pollResourceId'] option_id = obj['optionId'] if poll_id not in data: data[poll_id] = {} if option_id not in data[poll_id]: data[poll_id][option_id] = 1 else: data[poll_id][option_id] += 1 return data
async def simple_ws( uri: typing.AnyStr, on_open: typing.Callable = None, on_message: typing.Callable = None, on_error: typing.Callable = None, q: asyncio.Queue = None, ): """a simple WebSocket client""" def call_function(f, *args): if f: f(*args) loop = asyncio.get_event_loop() async with websockets.connect( uri, extra_headers=[ ('Cookie', 'csrftoken=9DWCWzfUyRtyYBbQJSiMhMTCNiR7ndzlu9FmlhPfiv8Sxqb5YhT7qaT8hJhWrbRw; ' 'sessionid=tn3464k1l3ykp9gi0u0fqokutgdh6bys')]) as client_side_ws: call_function(on_open) i = 0 while True: try: user_input_task = loop.create_task(q.get()) ws_recv_task = loop.create_task(client_side_ws.recv()) tasks = [user_input_task, ws_recv_task] await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) if user_input_task.done(): user_input = user_input_task.result() user_input = user_input.strip() data_dict = dict(urllib.parse.parse_qsl(user_input)) print('You enter: ', data_dict) data = json.dumps(data_dict) await client_side_ws.send(data) msg = None if ws_recv_task.done(): msg = ws_recv_task.result() call_function(on_message, msg) # Cancel remaining tasks so they do not generate errors as we exit without finishing them. for task in tasks: if not task.done(): task.cancel() if msg is not None: msg_dict: typing.Dict = json.loads(msg) message: typing.AnyStr = msg_dict.get('message') if message and message.strip() == 'bye': break except Exception as e: call_function(on_error, e) break
async def __task_candle_receiver(self, name: str, queue: asyncio.Queue) -> None:
    # Process candles
    while not self.__estop.is_set():
        # Wait for a candle
        flag, ticker = await wait_first([queue.get(), self.__estop.wait()])
        if not flag:
            break
        # Collect statistics
        if self.__perfomance:
            await self.__perfomance.add("candles")
        # Log
        if self.show_candles:
            self.logger.info(f"Candle <{name}>: {ticker}")
def bfs(self):
    queue = Queue()
    queue.put(self)
    while not queue.empty():
        current_node = queue.get()
        print(current_node.value)
        if current_node.left_child:
            queue.put(current_node.left_child)
        if current_node.right_child:
            queue.put(current_node.right_child)
def input(self, fd, dst):
    q = Queue()

    def cb():
        q.put_nowait(os.read(fd, 32))

    self.loop.add_reader(fd, cb)
    try:
        while True:
            data = yield from q.get()
            if not data:
                break
            yield from send(dst, BYTES, data)
    finally:
        self.loop.remove_reader(fd)
class Echo(WS): def __init__(self, loop=None): self.queue = Queue(loop=loop) def get(self): return self.queue.get() def on_message(self, ws, message): self.queue.put_nowait(message) def on_ping(self, ws, body): ws.pong(body) self.queue.put_nowait('PING: %s' % body.decode('utf-8')) def on_pong(self, ws, body): self.queue.put_nowait('PONG: %s' % body.decode('utf-8')) def on_close(self, ws): self.queue.put_nowait('CLOSE')
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',text)) if urls: LOGGER.info('got %r distinct urls from %r',len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) #1 break #2 except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error else: return try: if is_redirect(response): location = response.headers['location'] else: #4 stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() #q.get() Remove and return an item from the queue. If queue is empty, wait until an item is available. #print('url',url, 'max_redirect', max_redirect) assert url in self.seen_urls #assert 断言,异常会直接抛出 yield from self.fetch(url, max_redirect) self.q.task_done() #Indicate that a formerly enqueued task is complete.表明以前排队的任务完成 except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) #put_nowait() Put an item into the queue without blocking.此句实际最先执行 @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() #Block until all items in the queue have been gotten and processed.保持阻塞状态,直到处理了队列中的所有项目为止 self.t1 = time.time() for w in workers: w.cancel()
class SubscribeListener(SubscribeCallback): def __init__(self): self.connected = False self.connected_event = Event() self.disconnected_event = Event() self.presence_queue = Queue() self.message_queue = Queue() self.error_queue = Queue() def status(self, pubnub, status): if utils.is_subscribed_event(status) and not self.connected_event.is_set(): self.connected_event.set() elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set(): self.disconnected_event.set() elif status.is_error(): self.error_queue.put_nowait(status.error_data.exception) def message(self, pubnub, message): self.message_queue.put_nowait(message) def presence(self, pubnub, presence): self.presence_queue.put_nowait(presence) @asyncio.coroutine def _wait_for(self, coro): scc_task = asyncio.ensure_future(coro) err_task = asyncio.ensure_future(self.error_queue.get()) yield from asyncio.wait([ scc_task, err_task ], return_when=asyncio.FIRST_COMPLETED) if err_task.done() and not scc_task.done(): if not scc_task.cancelled(): scc_task.cancel() raise err_task.result() else: if not err_task.cancelled(): err_task.cancel() return scc_task.result() @asyncio.coroutine def wait_for_connect(self): if not self.connected_event.is_set(): yield from self._wait_for(self.connected_event.wait()) else: raise Exception("instance is already connected") @asyncio.coroutine def wait_for_disconnect(self): if not self.disconnected_event.is_set(): yield from self._wait_for(self.disconnected_event.wait()) else: raise Exception("instance is already disconnected") @asyncio.coroutine def wait_for_message_on(self, *channel_names): channel_names = list(channel_names) while True: try: env = yield from self._wait_for(self.message_queue.get()) if env.channel in channel_names: return env else: continue finally: self.message_queue.task_done() @asyncio.coroutine def wait_for_presence_on(self, *channel_names): channel_names = list(channel_names) while True: try: env = yield from self._wait_for(self.presence_queue.get()) if env.channel in channel_names: return env else: continue finally: self.presence_queue.task_done()
class Messagedispatcher: def __init__(self, communicator): self.communicator = communicator self.messages = { "direct": { "status": { "class": messages.StatusDirect, "queue": Queue() }, "pinor": { "class": messages.PinorDirect, "queue": Queue() } }, "mesh": { "status": { "class": messages.StatusMesh, "queue": Queue() }, "pinor": { "class": messages.PinorMesh, "queue": Queue() }, "return": { "class": messages.ReturnMesh, "queue": Queue() }, "deploy": { "class": messages.DeployMesh, "queue": Queue() }, "grid": { "class": messages.GridMesh, "queue": Queue() } } } self.mesh_queue = Queue() @coroutine def wait_for_message(self, *types): x = self.messages for i in types: x = x[i] q = x["queue"] return (yield from q.get()) @coroutine def get_mesh_message(self): return (yield from self.mesh_queue.get()) @coroutine def startup(self): while True: meshput = False msg = yield from self.communicator.receive() if msg["type"] == "mesh": meshput = True x = self.messages x = x[msg["type"]] x = x[msg["data"]["datatype"]] q = x["queue"] c = x["class"] emsg = c.from_json(msg) yield from q.put(emsg) if meshput: # print("RECEIVE: " + str(msg) + "\n") yield from self.mesh_queue.put(emsg)
class _EventManager(object): def __init__(self): providers = {} self.__registration = {} self.__module_functions = {} self.__events = Queue() @coroutine def handleEvents(self): while True: event, args, future = yield from self.__events.get() logger.debug("Handling event {}".format(event)) for fn, expects in self.__registration[event[0]]: fire = True if len(event) - 1 != len(expects): continue for i in range(len(event)-1): ev = event[i+1].lower() ex = expects[i] if isinstance(ex, list): if not any(ev == val.lower() for val in ex): logger.error("Won't fire") fire = False break else: if ev.lower() != ex.lower(): fire = False break if fire: logger.debug("Firing event function: {} with {}".format(fn.__name__, args)) ret = fn(event=event, **args) future.set_result(ret) @coroutine def handle_event(self, event, args): logger.debug('Handling event {}'.format(event)) to_call = [] results = [] for fn, expects in self.__registration[event[0]]: fire = True if len(event) -1 != len(expects): continue for i in range(len(event)-1): ev = event[i+1].lower() ex = expects[i] if isinstance(ex, list): if not any(ev == val.lower() for val in ex): logger.error("Won't fire") fire = False break else: if ev.lower() != ex.lower(): fire = False break if fire: to_call.append(fn(event=event, **args)) if len(to_call) > 0: results = yield from gather(*to_call) return results def register_class(self, cls): methods = inspect.getmembers(cls, predicate=inspect.ismethod) for _, f in methods: fn = f event = getattr(fn, '__event__', None) if event is not None: logger.debug('Registering {} for {}'.format(fn.__name__, event)) self.register_function(event, fn) def register_function(self, event, func): primary = event[0] expects = [] if len(event) > 1: expects = event[1:] if not primary in self.__registration: self.__registration[primary] = [] self.__registration[primary].append([func, expects]) mod = sys.modules[func.__module__] if not mod in self.__module_functions: self.__module_functions[mod] = [] self.__module_functions[mod].append(func) @coroutine def fire_event(self, *event, **kwargs): results = yield from self.handle_event(event, kwargs) return results def unregisterModuleFunctions(self, mod): if not mod in self.__module_functions: return True for r in __registration: self.__registration[r][:] = [i for i,_ in self.__registration[r] if i not in self.__module_functions[mod]] del self.__module_functions[mod]
class BasePlugin(metaclass=ABCMeta): '''Core plug-in functionality A Sphinx plug-in needs to provide a minimim set of services in order to be useful. Those are defined here, with default implementations where it makes sense. ''' # This is a handle to the data bus. It's set when we are registered. _databus = None # Type manager handle _tm = None def __init__(self, runner, plugins, source = None): '''Constructor This is how our plugin pipeline is constructed. Each plugin instance is created when the input script is read, and they are chained together, from source to sink, here. This method _must_ be called with the event loop from which it will be called in the future, e.g., asyncio.get_event_loop(). ''' # A dict that maps each destination for our data, to the type that the # destination can consume. self._sinks = {} # Retain a pointer to our source, and add ourself to it's list of sinks. self._source = source if source: # Validate that we can process data from this source sink_types = set(source.sources()).intersection(self.sinks()) if len(sink_types): source._set_sink(self, sink_types.pop()) else: err = "{} cannot sink '{}'".format(self, source.sources()) _log.error(err) raise ImpedenceMismatchError(err) # Our input queue self._queue = Queue() self.runner = runner self._plugins = plugins # create_task schedules the execution of the coroutine "run", wrapped # in a future. self._task = self.runner.create_task(self.run()) def __getattr__(self, name): '''Plugin Pipeline Bulding This method is called when Python can't find a requested attribute. We use it to create a new plugin instance to add to the pipeline. ''' if name in self._plugins: return partial(self._plugins[name], source = self) else: raise AttributeError def _set_sink(self, sink, data_type): '''Register a sink Called during initialization to register a sink (destination for our output). ''' self._sinks[sink] = data_type @coroutine def publish(self, data): '''Publish data Called by a plugin to publish data to it's sinks. ''' for sink, data_type in self._sinks.items(): # Special case 'None', since that's our 'eof'. See the 'done' # method below. if data: data = self.xform_data(data, data_type) yield from self._databus.publish(data, sink) @coroutine def write_data(self, data): '''Write data to queue Called by the databus controller to enqueue data from our source. ''' yield from self._queue.put(data) @coroutine def read_data(self): '''Read data from queue Called by plugins to get data from their sources. ''' payload = yield from self._queue.get() return payload @coroutine def done(self): '''The plugin is finished Called by a plugin to indicate to it's sinks that it has no more data. ''' # TODO: It feels clumsy to use getting "None" as "EOT". Also, it # requires that the plugins test for it to stop reading data. yield from self.publish(None) # Sources and sinks, oh my! These follow the current flow analogy. # Data flows from a source to a sink. Our input comes from a source, # and we sink it, process the data in some manner, and then source # it to the next plugin in the pipeline. @classmethod def sinks(cls): '''Sink types These are an array of types that we sink, i.e., read. ''' return [] @classmethod def sources(cls): '''Source types These are an array of types that we source, i.e., write. ''' return [] @classmethod def set_databus(cls, db): '''A handler to the Semantic Databus This gets set when the plug-in is registered. 
''' cls._databus = db cls._tm = db._typemgr @classmethod def script_name(cls): '''Return the plug-in's script name. The script name is how the plug-in is referred to by command scripts. ''' pass @abstractmethod def xform_data(self, data, to_type): '''Transform data to a specific type This method must be able to transform the input, 'data', to the 'to_type'. The plugin will only be responsible for transforming types that are specified in our "sources" method. There is no expectation on how the plugin represents 'data', but it would make sense to do so in some manner that is not only natural for the plugin, but also easily transformed. ''' pass @coroutine @abstractmethod def run(self): '''Our main method where work happens This is the method that will be invoked when the plug-in needs to do some work. ''' pass
class ProxyResponse(object): '''Asynchronous wsgi response. ''' _started = False _headers = None _done = False def __init__(self, environ, start_response): self._loop = environ['pulsar.connection']._loop self.environ = environ self.start_response = start_response self.queue = Queue() def __iter__(self): while True: if self._done: try: yield self.queue.get_nowait() except QueueEmpty: break else: yield async(self.queue.get(), loop=self._loop) def pre_request(self, response, exc=None): self._started = True response.bind_event('data_processed', self.data_processed) return response def error(self, exc): if not self._started: request = wsgi.WsgiRequest(self.environ) content_type = request.content_types.best_match( ('text/html', 'text/plain')) uri = self.environ['RAW_URI'] msg = 'Could not find %s' % uri logger.info(msg=msg) if content_type == 'text/html': html = wsgi.HtmlDocument(title=msg) html.body.append('<h1>%s</h1>' % msg) data = html.render() resp = wsgi.WsgiResponse(504, data, content_type='text/html') elif content_type == 'text/plain': resp = wsgi.WsgiResponse(504, msg, content_type='text/html') else: resp = wsgi.WsgiResponse(504, '') self.start_response(resp.status, resp.get_headers()) self._done = True self.queue.put_nowait(resp.content[0]) def data_processed(self, response, exc=None, **kw): '''Receive data from the requesting HTTP client.''' status = response.get_status() if status == '100 Continue': stream = self.environ.get('wsgi.input') or io.BytesIO() body = yield stream.read() response.transport.write(body) if response.parser.is_headers_complete(): if self._headers is None: headers = self.remove_hop_headers(response.headers) self._headers = Headers(headers, kind='server') # start the response self.start_response(status, list(self._headers)) body = response.recv_body() if response.parser.is_message_complete(): self._done = True self.queue.put_nowait(body) def remove_hop_headers(self, headers): for header, value in headers: if header.lower() not in wsgi.HOP_HEADERS: yield header, value
class BrokerProtocolHandler(ProtocolHandler): def __init__(self, plugins_manager: PluginManager, session: Session=None, loop=None): super().__init__(plugins_manager, session, loop) self._disconnect_waiter = None self._pending_subscriptions = Queue(loop=self._loop) self._pending_unsubscriptions = Queue(loop=self._loop) @asyncio.coroutine def start(self): yield from super().start() if self._disconnect_waiter is None: self._disconnect_waiter = futures.Future(loop=self._loop) @asyncio.coroutine def stop(self): yield from super().stop() if self._disconnect_waiter is not None and not self._disconnect_waiter.done(): self._disconnect_waiter.set_result(None) @asyncio.coroutine def wait_disconnect(self): return (yield from self._disconnect_waiter) def handle_write_timeout(self): pass def handle_read_timeout(self): if self._disconnect_waiter is not None and not self._disconnect_waiter.done(): self._disconnect_waiter.set_result(None) @asyncio.coroutine def handle_disconnect(self, disconnect): self.logger.debug("Client disconnecting") if self._disconnect_waiter and not self._disconnect_waiter.done(): self.logger.debug("Setting waiter result to %r" % disconnect) self._disconnect_waiter.set_result(disconnect) @asyncio.coroutine def handle_connection_closed(self): yield from self.handle_disconnect(None) @asyncio.coroutine def handle_connect(self, connect: ConnectPacket): # Broker handler shouldn't received CONNECT message during messages handling # as CONNECT messages are managed by the broker on client connection self.logger.error('%s [MQTT-3.1.0-2] %s : CONNECT message received during messages handling' % (self.session.client_id, format_client_message(self.session))) if self._disconnect_waiter is not None and not self._disconnect_waiter.done(): self._disconnect_waiter.set_result(None) @asyncio.coroutine def handle_pingreq(self, pingreq: PingReqPacket): yield from self._send_packet(PingRespPacket.build()) @asyncio.coroutine def handle_subscribe(self, subscribe: SubscribePacket): subscription = {'packet_id': subscribe.variable_header.packet_id, 'topics': subscribe.payload.topics} yield from self._pending_subscriptions.put(subscription) @asyncio.coroutine def handle_unsubscribe(self, unsubscribe: UnsubscribePacket): unsubscription = {'packet_id': unsubscribe.variable_header.packet_id, 'topics': unsubscribe.payload.topics} yield from self._pending_unsubscriptions.put(unsubscription) @asyncio.coroutine def get_next_pending_subscription(self): subscription = yield from self._pending_subscriptions.get() return subscription @asyncio.coroutine def get_next_pending_unsubscription(self): unsubscription = yield from self._pending_unsubscriptions.get() return unsubscription @asyncio.coroutine def mqtt_acknowledge_subscription(self, packet_id, return_codes): suback = SubackPacket.build(packet_id, return_codes) yield from self._send_packet(suback) @asyncio.coroutine def mqtt_acknowledge_unsubscription(self, packet_id): unsuback = UnsubackPacket.build(packet_id) yield from self._send_packet(unsuback) @asyncio.coroutine def mqtt_connack_authorize(self, authorize: bool): if authorize: connack = ConnackPacket.build(self.session.parent, CONNECTION_ACCEPTED) else: connack = ConnackPacket.build(self.session.parent, NOT_AUTHORIZED) yield from self._send_packet(connack) @classmethod @asyncio.coroutine def init_from_connect(cls, reader: ReaderAdapter, writer: WriterAdapter, plugins_manager, loop=None): """ :param reader: :param writer: :param plugins_manager: :param loop: :return: """ remote_address, remote_port = 
writer.get_peer_info() connect = yield from ConnectPacket.from_stream(reader) yield from plugins_manager.fire_event(EVENT_MQTT_PACKET_RECEIVED, packet=connect) if connect.payload.client_id is None: raise MQTTException('[[MQTT-3.1.3-3]] : Client identifier must be present' ) if connect.variable_header.will_flag: if connect.payload.will_topic is None or connect.payload.will_message is None: raise MQTTException('will flag set, but will topic/message not present in payload') if connect.variable_header.reserved_flag: raise MQTTException('[MQTT-3.1.2-3] CONNECT reserved flag must be set to 0') if connect.proto_name != "MQTT": raise MQTTException('[MQTT-3.1.2-1] Incorrect protocol name: "%s"' % connect.proto_name) connack = None error_msg = None if connect.proto_level != 4: # only MQTT 3.1.1 supported error_msg = 'Invalid protocol from %s: %d' % \ (format_client_message(address=remote_address, port=remote_port), connect.proto_level) connack = ConnackPacket.build(0, UNACCEPTABLE_PROTOCOL_VERSION) # [MQTT-3.2.2-4] session_parent=0 elif not connect.username_flag and connect.password_flag: connack = ConnackPacket.build(0, BAD_USERNAME_PASSWORD) # [MQTT-3.1.2-22] elif connect.username_flag and not connect.password_flag: connack = ConnackPacket.build(0, BAD_USERNAME_PASSWORD) # [MQTT-3.1.2-22] elif connect.username_flag and connect.username is None: error_msg = 'Invalid username from %s' % \ (format_client_message(address=remote_address, port=remote_port)) connack = ConnackPacket.build(0, BAD_USERNAME_PASSWORD) # [MQTT-3.2.2-4] session_parent=0 elif connect.password_flag and connect.password is None: error_msg = 'Invalid password %s' % (format_client_message(address=remote_address, port=remote_port)) connack = ConnackPacket.build(0, BAD_USERNAME_PASSWORD) # [MQTT-3.2.2-4] session_parent=0 elif connect.clean_session_flag is False and (connect.payload.client_id is None or connect.payload.client_id == ""): error_msg = '[MQTT-3.1.3-8] [MQTT-3.1.3-9] %s: No client Id provided (cleansession=0)' % \ format_client_message(address=remote_address, port=remote_port) connack = ConnackPacket.build(0, IDENTIFIER_REJECTED) if connack is not None: yield from plugins_manager.fire_event(EVENT_MQTT_PACKET_SENT, packet=connack) yield from connack.to_stream(writer) yield from writer.close() raise MQTTException(error_msg) incoming_session = Session(loop) incoming_session.client_id = connect.client_id incoming_session.clean_session = connect.clean_session_flag incoming_session.will_flag = connect.will_flag incoming_session.will_retain = connect.will_retain_flag incoming_session.will_qos = connect.will_qos incoming_session.will_topic = connect.will_topic incoming_session.will_message = connect.will_message incoming_session.username = connect.username incoming_session.password = connect.password if connect.keep_alive > 0: incoming_session.keep_alive = connect.keep_alive else: incoming_session.keep_alive = 0 handler = cls(plugins_manager, loop=loop) return handler, incoming_session
class Cloner(object): def __init__(self, root): self.visited_urls = [] self.root = self.add_scheme(root) if len(self.root.host) < 4: sys.exit('invalid taget {}'.format(self.root.host)) self.target_path = '/opt/snare/pages/{}'.format(self.root.host) if not os.path.exists(self.target_path): os.mkdir(self.target_path) self.new_urls = Queue() @staticmethod def add_scheme(url): if yarl.URL(url).scheme: new_url = yarl.URL(url) else: new_url = yarl.URL('http://' + url) return new_url @asyncio.coroutine def process_link(self, url, check_host=False): url = yarl.URL(url) if check_host: if (url.host != self.root.host or url.fragment or url in self.visited_urls): return None if not url.is_absolute(): url = self.root.join(url) yield from self.new_urls.put(url) return url.relative().human_repr() @asyncio.coroutine def replace_links(self, data): soup = BeautifulSoup(data, 'html.parser') # find all relative links for link in soup.findAll(href=True): res = yield from self.process_link(link['href'], check_host=True) if res is not None: link['href'] = res # find all images and scripts for elem in soup.findAll(src=True): res = yield from self.process_link(elem['src']) if res is not None: elem['src'] = res # find all action elements for act_link in soup.findAll(action=True): res = yield from self.process_link(act_link['action']) if res is not None: act_link['action'] = res # prevent redirects for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}): redir['value'] = yarl.URL(redir['value']).relative().human_repr() return soup @asyncio.coroutine def get_body(self): while not self.new_urls.empty(): current_url = yield from self.new_urls.get() if current_url in self.visited_urls: continue self.visited_urls.append(current_url) if current_url.name: file_name = current_url.name elif current_url.raw_path != '/': file_name = current_url.path.rsplit('/')[1] else: file_name = 'index.html' file_path = os.path.dirname(current_url.path) if file_path == '/': file_path = self.target_path else: file_path = os.path.join(self.target_path, file_path[1:]) print('path: ', file_path, 'name: ', file_name) if file_path and not os.path.exists(file_path): os.makedirs(file_path) data = None try: with aiohttp.Timeout(10.0): with aiohttp.ClientSession() as session: response = yield from session.get(current_url) data = yield from response.read() except aiohttp.ClientError as client_error: print(client_error) else: response.release() session.close() if data is not None: if re.match(re.compile('.*\.(html|php)'), file_name): soup = yield from self.replace_links(data) data = str(soup).encode() with open(os.path.join(file_path, file_name), 'wb') as index_fh: index_fh.write(data) if '.css' in file_name: css = cssutils.parseString(data) for carved_url in cssutils.getUrls(css): if carved_url.startswith('data'): continue carved_url = yarl.URL(carved_url) if not carved_url.is_absolute(): carved_url = self.root.join(carved_url) if carved_url not in self.visited_urls: yield from self.new_urls.put(carved_url) @asyncio.coroutine def run(self): yield from self.new_urls.put(self.root) return (yield from self.get_body())
class Crawler(object): """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__( self, roots, scraper=None, data_handler=None, exclude=None, strict=True, # What to crawl. max_redirect=5, max_tries=10, # Per-url limits. max_tasks=10, max_connections_per_host=3, *, loop=None ): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.max_connections_per_host = max_connections_per_host self.scraper = scraper self.data_handler = data_handler self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r"\A[\d\.]*\Z", host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_urls(root) self.t0 = time.time() self.t1 = None def record_statistic( self, url=None, next_url=None, status=None, exception=None, content_type=None, encoding=None, num_urls=0, num_new_urls=0, ): """Record the FetchStatistic for completed / failed URL.""" fetch_statistic = FetchStatistic( url=url, next_url=next_url, status=status, size=0, exception=exception, content_type=content_type, encoding=encoding, num_urls=num_urls, num_new_urls=num_new_urls, ) self.done.append(fetch_statistic) def extract_data(self, root_url, html): raise NotImplementedError("You need to define a extract_data method!") def close(self): """Close resources.""" LOGGER.debug("closing resources") self.session.close() @asyncio.coroutine def parse_links(self, web_page_html, base_url, _content_type, _encoding): """Return a list of links.""" links = set() tree = html.fromstring(web_page_html) tree.make_links_absolute(base_url) urls = [link[2] for link in tree.iterlinks()] for url in urls: defragmented, frag = urllib.parse.urldefrag(url) if verify.url_allowed( defragmented, self.root_domains, exclude=self.exclude ): # Select Valid links, testing against regexp and root_domains links.add(defragmented) if urls: LOGGER.info( "got %r urls from %r new links: %i visited: %i", len(urls), base_url, len(links - self.seen_urls), len(self.seen_urls), ) new_links = [link for link in links.difference(self.seen_urls)] self.record_statistic( url=base_url, content_type=_content_type, encoding=_encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls), ) return new_links def handle_redirect(self, response, url, max_redirect): location = response.headers["location"] next_url = urllib.parse.urljoin(url, location) self.record_statistic(url=url, next_url=next_url, status=response.status) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info("redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1) self.add_urls(next_url, max_redirect - 1) else: LOGGER.error("redirect limit reached for %r from %r", next_url, url) return @asyncio.coroutine def fetch(self, url, max_redirect, sem): """Fetch one URL.""" tries = 0 web_page = None exception = None _url = None _encoding = None _content_type = None sleep_time = 0 while tries < self.max_tries: try: with (yield from sem): response = yield from asyncio.wait_for( self.session.get(url, allow_redirects=False), 10, 
loop=self.loop ) if tries > 1: LOGGER.debug("try %r for %r success", tries, url) break except Exception as client_error: sleep_time += 5 yield from asyncio.sleep(sleep_time) LOGGER.error("try %r for %r raised %r", tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error("%r failed after %r tries", url, self.max_tries) self.record_statistic(url=url, exception=exception) return (web_page, _url, _content_type, _encoding) try: _url, _content_type, _encoding = get_content_type_and_encoding(response) if is_redirect(response): self.handle_redirect(response, url, max_redirect) web_page = "redirect" elif response.status == 200 and _content_type in ("text/html", "application/xml"): web_page = yield from response.text() else: self.record_statistic( url=response.url, status=response.status, content_type=_content_type, encoding=_encoding ) except Exception as e: print("*******error**********") finally: yield from response.release() return (web_page, _url, _content_type, _encoding) def add_urls(self, urls, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect if not isinstance(urls, str): urls = set(urls) for link in urls.difference(self.seen_urls): self.q.put_nowait((link, max_redirect)) self.seen_urls.update(urls) elif urls not in self.seen_urls: self.q.put_nowait((urls, max_redirect)) self.seen_urls.add(urls) @asyncio.coroutine def work(self, sem): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() # assert url in self.seen_urls web_page, url, content_type, encoding = yield from self.fetch(url, max_redirect, sem) if web_page and web_page != "redirect": new_links = yield from self.parse_links(web_page, url, content_type, encoding) if self.scraper: data = self.scraper.scrape(url, web_page) if self.data_handler: self.data_handler.handle(data) self.add_urls(new_links) self.q.task_done() except (asyncio.CancelledError,): print("error") @asyncio.coroutine def crawl(self): sem = asyncio.Semaphore(value=self.max_connections_per_host, loop=self.loop) """Run the crawler until all finished.""" LOGGER.info("Starting crawl...") workers = [asyncio.Task(self.work(sem), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()