class AsyncModbusGeneratorClient(AsyncModbusSerialClient):
    """Serial Modbus client whose request methods are Tornado coroutines.

    Every request goes through a single-permit semaphore, so at most one
    transaction is in flight at any time.
    """

    def __init__(self, method='ascii', **kwargs):
        """Create the client.

        :param method: framing method, forwarded to the parent constructor.
        :param kwargs: remaining keyword arguments, forwarded unchanged to
            the parent constructor.
        """
        super(AsyncModbusGeneratorClient, self).__init__(method=method, **kwargs)
        # One permit: requests are executed strictly one after another.
        self.sem = Semaphore(1)

    @gen.coroutine
    def _execute_serialized(self, request):
        """Run ``request`` while holding the semaphore and return its response.

        The object returned by ``self.execute`` is bridged to a Tornado
        ``Future`` through its callback chain.

        :param request: the request object handed to ``self.execute``.
        :return: whatever the callback chain delivers as the result.
        :raises: the failure's exception when the request errors out.
        """
        fut_result = Future()
        yield self.sem.acquire()
        try:
            res = self.execute(request)
            res.addCallback(fut_result.set_result)
            # Without an errback a failed request would leave fut_result
            # pending forever and the semaphore would never be released.
            # NOTE(review): assumes `res` is a Twisted-style Deferred (it is
            # already used through `addCallback`) whose errback receives a
            # Failure exposing the exception as `.value` -- confirm.
            res.addErrback(
                lambda failure: fut_result.set_exception(failure.value))
            response = yield fut_result
        finally:
            self.sem.release()
        raise gen.Return(response)

    @gen.coroutine
    def read_input_registers(self, address, count=1, **kwargs):
        """Send a ``ReadInputRegistersRequest`` and return its response.

        :param address: first argument of the request constructor.
        :param count: second argument of the request constructor.
        :param kwargs: extra keyword arguments for the request constructor.
        """
        request = ReadInputRegistersRequest(address, count, **kwargs)
        response = yield self._execute_serialized(request)
        raise gen.Return(response)

    @gen.coroutine
    def read_holding_registers(self, address, count=1, **kwargs):
        """Send a ``ReadHoldingRegistersRequest`` and return its response.

        :param address: first argument of the request constructor.
        :param count: second argument of the request constructor.
        :param kwargs: extra keyword arguments for the request constructor.
        """
        request = ReadHoldingRegistersRequest(address, count, **kwargs)
        response = yield self._execute_serialized(request)
        raise gen.Return(response)

    @gen.coroutine
    def write_coil(self, address, value, **kwargs):
        """Send a ``WriteSingleCoilRequest`` and return its response.

        :param address: first argument of the request constructor.
        :param value: second argument of the request constructor.
        :param kwargs: extra keyword arguments for the request constructor.
        """
        request = WriteSingleCoilRequest(address, value, **kwargs)
        response = yield self._execute_serialized(request)
        raise gen.Return(response)

    @gen.coroutine
    def write_register(self, address, value, **kwargs):
        """Send a ``WriteSingleRegisterRequest`` and return its response.

        :param address: first argument of the request constructor.
        :param value: second argument of the request constructor.
        :param kwargs: extra keyword arguments for the request constructor.
        """
        request = WriteSingleRegisterRequest(address, value, **kwargs)
        response = yield self._execute_serialized(request)
        raise gen.Return(response)
class TornadoSubscriptionManager(SubscriptionManager):
    """Subscription manager that runs on a Tornado IOLoop.

    It schedules the subscribe loop, the heartbeat timer and the message
    worker on ``self._pubnub.ioloop``. Attributes such as ``_pubnub``,
    ``_listener_manager``, ``_subscription_state``, ``_timetoken`` and
    ``_region`` are not assigned in this class; they are expected to come
    from the base class.
    """

    def __init__(self, pubnub_instance):
        """Set up queues, events and locks, then start the message worker.

        :param pubnub_instance: object passed to the base constructor and to
            the reconnection manager.
        """
        # Captured by the nested callback class below, whose own `self`
        # shadows the manager.
        subscription_manager = self

        self._message_queue = Queue()
        self._consumer_event = Event()
        self._cancellation_event = Event()
        # Single permit: only one subscribe loop may be active at a time.
        self._subscription_lock = Semaphore(1)
        # self._current_request_key_object = None
        self._heartbeat_periodic_callback = None
        self._reconnection_manager = TornadoReconnectionManager(pubnub_instance)

        super(TornadoSubscriptionManager, self).__init__(pubnub_instance)
        self._start_worker()

        class TornadoReconnectionCallback(ReconnectionCallback):
            def on_reconnect(self):
                """Restart the subscribe loop and announce a reconnected status."""
                subscription_manager.reconnect()

                pn_status = PNStatus()
                pn_status.category = PNStatusCategory.PNReconnectedCategory
                pn_status.error = False

                subscription_manager._subscription_status_announced = True
                subscription_manager._listener_manager.announce_status(pn_status)

        self._reconnection_listener = TornadoReconnectionCallback()
        self._reconnection_manager.set_reconnection_listener(self._reconnection_listener)

    def _set_consumer_event(self):
        """Set the event shared with the message worker."""
        self._consumer_event.set()

    def _message_queue_put(self, message):
        """Put ``message`` on the queue read by the message worker."""
        self._message_queue.put(message)

    def _start_worker(self):
        """Create the message worker and schedule its ``run`` on the IOLoop."""
        self._consumer = TornadoSubscribeMessageWorker(self._pubnub,
                                                       self._listener_manager,
                                                       self._message_queue,
                                                       self._consumer_event)
        run = stack_context.wrap(self._consumer.run)
        self._pubnub.ioloop.spawn_callback(run)

    def reconnect(self):
        """Clear the stop flag and schedule a new subscribe loop."""
        self._should_stop = False
        self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
        # self._register_heartbeat_timer()

    def disconnect(self):
        """Set the stop flag, stop the heartbeat timer and cancel the loop."""
        self._should_stop = True
        self._stop_heartbeat_timer()
        self._stop_subscribe_loop()

    @tornado.gen.coroutine
    def _start_subscribe_loop(self):
        """Run one subscribe request and schedule the follow-up.

        Cancels any loop already running, waits for the subscription lock,
        issues a ``Subscribe`` call for the current channels and groups, and
        then waits for either the response or the cancellation event,
        whichever resolves first.
        """
        self._stop_subscribe_loop()

        yield self._subscription_lock.acquire()

        self._cancellation_event.clear()

        combined_channels = self._subscription_state.prepare_channel_list(True)
        combined_groups = self._subscription_state.prepare_channel_group_list(True)

        if len(combined_channels) == 0 and len(combined_groups) == 0:
            # Nothing to subscribe to. Give the lock back before leaving,
            # otherwise every later loop would block forever on acquire().
            self._subscription_lock.release()
            return

        envelope_future = Subscribe(self._pubnub) \
            .channels(combined_channels).channel_groups(combined_groups) \
            .timetoken(self._timetoken).region(self._region) \
            .filter_expression(self._pubnub.config.filter_expression) \
            .cancellation_event(self._cancellation_event) \
            .future()

        canceller_future = self._cancellation_event.wait()

        wi = tornado.gen.WaitIterator(envelope_future, canceller_future)

        # iterates 2 times: one for result one for cancelled
        while not wi.done():
            try:
                result = yield wi.next()
            except Exception as e:
                # TODO: verify the error will not be eaten
                logger.error(e)
                raise
            else:
                if wi.current_future == envelope_future:
                    e = result
                elif wi.current_future == canceller_future:
                    return
                else:
                    raise Exception("Unexpected future resolved: %s" % str(wi.current_future))

                if e.is_error():
                    # 599 error doesn't works - tornado use this status code
                    # for a wide range of errors, for ex:
                    # HTTP Server Error (599): [Errno -2] Name or service not known
                    if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
                        self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
                        return

                    logger.error("Exception in subscribe loop: %s" % str(e))

                    if e.status is not None and e.status.category == PNStatusCategory.PNAccessDeniedCategory:
                        e.status.operation = PNOperationType.PNUnsubscribeOperation

                    self._listener_manager.announce_status(e.status)

                    self._reconnection_manager.start_polling()
                    self.disconnect()
                    return
                else:
                    self._handle_endpoint_call(e.result, e.status)
                    self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
            finally:
                self._cancellation_event.set()
                yield tornado.gen.moment
                self._subscription_lock.release()
                self._cancellation_event.clear()
                # NOTE: `break` inside `finally` discards any pending
                # `return` or in-flight exception, so the `raise` statements
                # above never propagate and the loop body runs at most once.
                break

    def _stop_subscribe_loop(self):
        """Set the cancellation event if it exists and is not already set."""
        if self._cancellation_event is not None and not self._cancellation_event.is_set():
            self._cancellation_event.set()

    def _stop_heartbeat_timer(self):
        """Stop the periodic heartbeat callback, if one was registered."""
        if self._heartbeat_periodic_callback is not None:
            self._heartbeat_periodic_callback.stop()

    def _register_heartbeat_timer(self):
        """Run the base-class hook, then start a periodic heartbeat callback.

        The period is ``config.heartbeat_interval`` multiplied by
        ``HEARTBEAT_INTERVAL_MULTIPLIER``.
        """
        super(TornadoSubscriptionManager, self)._register_heartbeat_timer()
        self._heartbeat_periodic_callback = PeriodicCallback(
            stack_context.wrap(self._perform_heartbeat_loop),
            self._pubnub.config.heartbeat_interval * TornadoSubscriptionManager.HEARTBEAT_INTERVAL_MULTIPLIER,
            self._pubnub.ioloop)
        self._heartbeat_periodic_callback.start()

    @tornado.gen.coroutine
    def _perform_heartbeat_loop(self):
        """Send one heartbeat for the current presence channels and groups.

        Returns without sending when there are none. The resulting status is
        announced to listeners depending on
        ``config.heartbeat_notification_options``.
        """
        if self._heartbeat_call is not None:
            # TODO: cancel call
            pass

        cancellation_event = Event()
        state_payload = self._subscription_state.state_payload()
        presence_channels = self._subscription_state.prepare_channel_list(False)
        presence_groups = self._subscription_state.prepare_channel_group_list(False)

        if len(presence_channels) == 0 and len(presence_groups) == 0:
            return

        try:
            envelope = yield self._pubnub.heartbeat() \
                .channels(presence_channels) \
                .channel_groups(presence_groups) \
                .state(state_payload) \
                .cancellation_event(cancellation_event) \
                .future()

            heartbeat_verbosity = self._pubnub.config.heartbeat_notification_options
            # NOTE(review): `is_error` is referenced here without being
            # called, while the subscribe loop calls `e.is_error()` on an
            # envelope. If `status.is_error` is a method, this condition is
            # always truthy -- confirm.
            if envelope.status.is_error:
                # NOTE(review): both operands compare against ALL; one of
                # them was probably meant to be a different option -- confirm.
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL or \
                        heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)
            else:
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)

        except PubNubTornadoException:
            pass
            # TODO: check correctness
            # if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
            #     self._start_subscribe_loop()
            # else:
            #     self._listener_manager.announce_status(e.status)
        except Exception as e:
            # Best effort: a failed heartbeat must not break the timer, but
            # report it through the logger instead of stdout.
            logger.error(e)
        finally:
            cancellation_event.set()

    @tornado.gen.coroutine
    def _send_leave(self, unsubscribe_operation):
        """Send a ``Leave`` call and announce the resulting status.

        :param unsubscribe_operation: object providing the ``channels`` and
            ``channel_groups`` to leave.
        """
        envelope = yield Leave(self._pubnub) \
            .channels(unsubscribe_operation.channels) \
            .channel_groups(unsubscribe_operation.channel_groups).future()

        self._listener_manager.announce_status(envelope.status)
class KernelPool(object):
    '''
    Keeps a pool of kernels and hands them out one borrower at a time.
    A semaphore counts the kernels currently available for lending.
    '''

    def __init__(self, prespawn_count, kernel_manager):
        '''
        Starts ``prespawn_count`` kernels and wires up a client and an iopub
        handler for each of them.

        :param prespawn_count: Number of kernels to start; None means zero
        :param kernel_manager: Manager used to start and look up kernels
        '''
        prespawn_count = 0 if prespawn_count is None else prespawn_count

        self.kernel_manager = kernel_manager
        self.kernel_clients = {}
        self.on_recv_funcs = {}
        self.kernel_pool = []
        self.pool_index = 0
        self.kernel_semaphore = Semaphore(prespawn_count)

        for _ in range(prespawn_count):
            # When a seed notebook is configured, start the kernel type it names.
            seed_notebook = self.kernel_manager.parent.seed_notebook
            start_kwargs = {}
            if seed_notebook:
                start_kwargs['kernel_name'] = seed_notebook['metadata']['kernelspec']['name']
            kernel_id = kernel_manager.start_kernel(**start_kwargs)

            client = kernel_manager.get_kernel(kernel_id).client()
            self.kernel_clients[kernel_id] = client
            self.kernel_pool.append(kernel_id)

            iopub_stream = self.kernel_manager.connect_iopub(kernel_id)
            iopub_stream.on_recv(self.create_on_reply(kernel_id))

    @gen.coroutine
    def acquire(self):
        '''
        Waits for a free kernel, takes it out of the pool and returns it.
        The borrower must give it back through the release method.

        :return: Tuple of kernel client and kernel id
        '''
        yield self.kernel_semaphore.acquire()
        borrowed_id = self.kernel_pool.pop(0)
        raise gen.Return((self.kernel_clients[borrowed_id], borrowed_id))

    def release(self, kernel_id):
        '''
        Puts a borrowed kernel back at the end of the pool.

        :param kernel_id: Id of the kernel to return to the pool
        '''
        self.kernel_pool.append(kernel_id)
        self.kernel_semaphore.release()

    def _on_reply(self, kernel_id, msg_list):
        '''
        Deserializes a raw message and passes it to the callback registered
        for the kernel.

        :param kernel_id: Id of the kernel the message came from
        :param msg_list: Raw message parts as received
        '''
        session = self.kernel_clients[kernel_id].session
        _, frames = session.feed_identities(msg_list)
        message = self.kernel_clients[kernel_id].session.deserialize(frames)
        self.on_recv_funcs[kernel_id](message)

    def create_on_reply(self, kernel_id):
        '''
        Builds a one-argument handler bound to ``kernel_id`` so every kernel
        gets its own reply handler.

        :param kernel_id: Id of the kernel the handler is bound to
        :return: Function taking the raw message list
        '''
        def handle_reply(msg_list):
            return self._on_reply(kernel_id, msg_list)
        return handle_reply

    def on_recv(self, kernel_id, func):
        '''
        Registers the callback for io_pub messages of one kernel. Only a
        single callback is kept per kernel; a new one replaces the old one.

        :param kernel_id: Id of the kernel
        :param func: Callback function to handle the message
        '''
        self.on_recv_funcs[kernel_id] = func

    def shutdown(self):
        '''
        Stops the channels of every pooled client and shuts down its kernel,
        then shuts down whatever kernels the kernel manager still lists.
        '''
        for kid, client in self.kernel_clients.items():
            client.stop_channels()
            self.kernel_manager.shutdown_kernel(kid, now=True)

        # Kernels the manager knows about that did not come from this pool
        for kid in self.kernel_manager.list_kernel_ids():
            self.kernel_manager.shutdown_kernel(kid, now=True)
class ManagedKernelPool(KernelPool):
    '''
    Pool of prespawned kernels lent out one borrower at a time and shut
    down together with the pool.
    '''

    def __init__(self, prespawn_count, kernel_manager):
        '''
        Lets the base class initialise, then rebuilds the bookkeeping from
        the kernels the kernel manager lists.

        :param prespawn_count: Number of kernels to spawn; falsy means one
        :param kernel_manager: Manager used to look up kernels
        '''
        # A delegate kernel is always needed, so never spawn fewer than one
        prespawn_count = prespawn_count or 1
        super(ManagedKernelPool, self).__init__(prespawn_count, kernel_manager)

        self.pool_index = 0
        self.kernel_pool = []
        self.on_recv_funcs = {}
        self.kernel_clients = {}

        kernel_ids = self.kernel_manager.list_kernel_ids()
        self.kernel_semaphore = Semaphore(len(kernel_ids))

        # Attach a client and an iopub handler to every listed kernel
        for kernel_id in kernel_ids:
            client = kernel_manager.get_kernel(kernel_id).client()
            self.kernel_clients[kernel_id] = client
            self.kernel_pool.append(kernel_id)
            iopub_stream = self.kernel_manager.connect_iopub(kernel_id)
            iopub_stream.on_recv(self.create_on_reply(kernel_id))

    @gen.coroutine
    def acquire(self):
        '''
        Waits for a free kernel, takes it out of the pool and returns it.
        The borrower must give it back through the release method.

        :return: Tuple of kernel client and kernel id
        '''
        yield self.kernel_semaphore.acquire()
        borrowed_id = self.kernel_pool.pop(0)
        raise gen.Return((self.kernel_clients[borrowed_id], borrowed_id))

    def release(self, kernel_id):
        '''
        Puts a borrowed kernel back at the end of the pool.

        :param kernel_id: Id of the kernel to return to the pool
        '''
        self.kernel_pool.append(kernel_id)
        self.kernel_semaphore.release()

    def _on_reply(self, kernel_id, msg_list):
        '''
        Deserializes a raw message and passes it to the callback registered
        for the kernel.

        :param kernel_id: Id of the kernel the message came from
        :param msg_list: Raw message parts as received
        '''
        session = self.kernel_clients[kernel_id].session
        _, frames = session.feed_identities(msg_list)
        message = self.kernel_clients[kernel_id].session.deserialize(frames)
        self.on_recv_funcs[kernel_id](message)

    def create_on_reply(self, kernel_id):
        '''
        Builds a one-argument handler bound to ``kernel_id`` so every kernel
        gets its own reply handler.

        :param kernel_id: Id of the kernel the handler is bound to
        :return: Function taking the raw message list
        '''
        def handle_reply(msg_list):
            return self._on_reply(kernel_id, msg_list)
        return handle_reply

    def on_recv(self, kernel_id, func):
        '''
        Registers the callback for io_pub messages of one kernel. Only a
        single callback is kept per kernel; a new one replaces the old one.

        :param kernel_id: Id of the kernel
        :param func: Callback function to handle the message
        '''
        self.on_recv_funcs[kernel_id] = func

    def shutdown(self):
        '''
        Stops the channels of every pooled client and shuts down its kernel,
        then delegates to the base class shutdown.
        '''
        for kid, client in self.kernel_clients.items():
            client.stop_channels()
            self.kernel_manager.shutdown_kernel(kid, now=True)

        # Let the base class deal with kernels that were not created for our pool
        super(ManagedKernelPool, self).shutdown()
class ManagedKernelPool(KernelPool):
    """Pool of interchangeable delegate kernels.

    Kernels are lent out one borrower at a time and are all shut down with
    the pool.

    Parameters
    ----------
    prespawn_count
        Number of kernels to spawn immediately
    kernel_manager
        Kernel manager instance

    Attributes
    ----------
    kernel_clients : dict
        Map of kernel IDs to their client instances
    on_recv_funcs : dict
        Map of kernel IDs to iopub callback functions
    kernel_pool : list
        IDs of the kernels currently available for lending
    kernel_semaphore : tornado.locks.Semaphore
        Semaphore guarding access to the kernel pool
    """

    def __init__(self, prespawn_count, kernel_manager):
        # A delegate kernel is always needed, so never spawn fewer than one
        prespawn_count = prespawn_count or 1
        super(ManagedKernelPool, self).__init__(prespawn_count, kernel_manager)

        self.kernel_pool = []
        self.on_recv_funcs = {}
        self.kernel_clients = {}

        kernel_ids = self.kernel_manager.list_kernel_ids()
        self.kernel_semaphore = Semaphore(len(kernel_ids))

        # Attach a client and an iopub handler to every listed kernel
        for kernel_id in kernel_ids:
            client = kernel_manager.get_kernel(kernel_id).client()
            self.kernel_clients[kernel_id] = client
            self.kernel_pool.append(kernel_id)
            iopub_stream = self.kernel_manager.connect_iopub(kernel_id)
            iopub_stream.on_recv(self.create_on_reply(kernel_id))

    @gen.coroutine
    def acquire(self):
        """Waits for a free kernel and removes it from the available pool.

        Returns
        -------
        tuple
            Kernel client instance, kernel ID
        """
        yield self.kernel_semaphore.acquire()
        borrowed_id = self.kernel_pool.pop(0)
        raise gen.Return((self.kernel_clients[borrowed_id], borrowed_id))

    def release(self, kernel_id):
        """Makes a borrowed kernel available for requests again.

        Parameters
        ----------
        kernel_id : str
            Kernel to return to the pool
        """
        self.kernel_pool.append(kernel_id)
        self.kernel_semaphore.release()

    def _on_reply(self, kernel_id, msg_list):
        """Deserializes a raw message and passes it to the iopub callback
        registered for `kernel_id`.

        Parameters
        ----------
        kernel_id : str
            Kernel that sent the reply
        msg_list : list
            List of 0mq messages
        """
        session = self.kernel_clients[kernel_id].session
        _, frames = session.feed_identities(msg_list)
        message = self.kernel_clients[kernel_id].session.deserialize(frames)
        self.on_recv_funcs[kernel_id](message)

    def create_on_reply(self, kernel_id):
        """Builds a reply handler bound to one kernel.

        Parameters
        ----------
        kernel_id
            Kernel to listen to

        Returns
        -------
        function
            Callback taking a 0mq message list
        """
        def handle_reply(msg_list):
            return self._on_reply(kernel_id, msg_list)
        return handle_reply

    def on_recv(self, kernel_id, func):
        """Registers the iopub callback for one kernel.

        Only a single callback is kept per kernel; a new one replaces the
        old one.

        Parameters
        ----------
        kernel_id
            Kernel from which to receive iopub messages
        func
            Callback function to use for kernel iopub messages
        """
        self.on_recv_funcs[kernel_id] = func

    def shutdown(self):
        """Stops every client's channels, shuts down its kernel, then
        delegates to the base class shutdown.
        """
        for kid, client in self.kernel_clients.items():
            client.stop_channels()
            self.kernel_manager.shutdown_kernel(kid, now=True)

        # Let the base class deal with kernels that were not created for our pool
        super(ManagedKernelPool, self).shutdown()
class ManagedKernelPool(KernelPool):
    """Pool of identical delegate kernels, lent to one borrower at a time.

    All kernels are cleaned up when the pool is shut down.

    Parameters
    ----------
    prespawn_count
        Number of kernels to spawn immediately
    kernel_manager
        Kernel manager instance

    Attributes
    ----------
    kernel_clients : dict
        Kernel ID -> client instance used to talk to that kernel
    on_recv_funcs : dict
        Kernel ID -> iopub callback function
    kernel_pool : list
        Delegate kernel IDs that are currently free
    kernel_semaphore : tornado.locks.Semaphore
        Semaphore that controls access to the kernel pool
    """

    def __init__(self, prespawn_count, kernel_manager):
        # Fall back to a single delegate kernel when no count is given
        if not prespawn_count:
            prespawn_count = 1
        super(ManagedKernelPool, self).__init__(prespawn_count, kernel_manager)

        self.kernel_clients, self.on_recv_funcs = {}, {}
        self.kernel_pool = []

        known_ids = self.kernel_manager.list_kernel_ids()
        self.kernel_semaphore = Semaphore(len(known_ids))

        # Every listed kernel gets a client and an iopub handler
        for kid in known_ids:
            self.kernel_clients[kid] = kernel_manager.get_kernel(kid).client()
            self.kernel_pool.append(kid)
            stream = self.kernel_manager.connect_iopub(kid)
            stream.on_recv(self.create_on_reply(kid))

    @gen.coroutine
    def acquire(self):
        """Borrows the first free kernel once one is available.

        Returns
        -------
        tuple
            Kernel client instance, kernel ID
        """
        yield self.kernel_semaphore.acquire()
        kid = self.kernel_pool.pop(0)
        client = self.kernel_clients[kid]
        raise gen.Return((client, kid))

    def release(self, kernel_id):
        """Returns a borrowed kernel to the pool of available kernels.

        Parameters
        ----------
        kernel_id : str
            Kernel to return to the pool
        """
        self.kernel_pool.append(kernel_id)
        self.kernel_semaphore.release()

    def _on_reply(self, kernel_id, msg_list):
        """Deserializes a message and hands it to the iopub callback
        registered for `kernel_id`.

        Parameters
        ----------
        kernel_id : str
            Kernel that sent the reply
        msg_list : list
            List of 0mq messages
        """
        client = self.kernel_clients[kernel_id]
        _, remaining = client.session.feed_identities(msg_list)
        decoded = self.kernel_clients[kernel_id].session.deserialize(remaining)
        callback = self.on_recv_funcs[kernel_id]
        callback(decoded)

    def create_on_reply(self, kernel_id):
        """Creates a function that handles reply messages from one kernel.

        Parameters
        ----------
        kernel_id
            Kernel to listen to

        Returns
        -------
        function
            Callback taking a 0mq message list
        """
        def reply_handler(msg_list):
            return self._on_reply(kernel_id, msg_list)
        return reply_handler

    def on_recv(self, kernel_id, func):
        """Stores the iopub callback to use for a particular kernel.

        Keeping one entry per kernel avoids multiple callbacks per kernel
        client.

        Parameters
        ----------
        kernel_id
            Kernel from which to receive iopub messages
        func
            Callback function to use for kernel iopub messages
        """
        self.on_recv_funcs[kernel_id] = func

    def shutdown(self):
        """Stops all client channels and kernels, then runs the base class
        shutdown.
        """
        for kid, client in self.kernel_clients.items():
            client.stop_channels()
            self.kernel_manager.shutdown_kernel(kid, now=True)

        # The base class handles kernels that were not created for our pool
        super(ManagedKernelPool, self).shutdown()
class TornadoSubscriptionManager(SubscriptionManager):
    """Subscription manager that runs on a Tornado IOLoop.

    It schedules the subscribe loop, the heartbeat timer and the message
    worker on ``self._pubnub.ioloop``. Attributes such as ``_pubnub``,
    ``_listener_manager``, ``_subscription_state``, ``_timetoken`` and
    ``_region`` are not assigned in this class; they are expected to come
    from the base class.
    """

    def __init__(self, pubnub_instance):
        """Set up queues, events and locks, then start the message worker.

        :param pubnub_instance: object passed to the base constructor.
        """
        self._message_queue = Queue()
        self._consumer_event = Event()
        # Single permit: only one subscribe loop may be active at a time.
        self._subscription_lock = Semaphore(1)
        # self._current_request_key_object = None
        self._heartbeat_periodic_callback = None
        # Created per loop run and reset to None when the run finishes.
        self._cancellation_event = None
        super(TornadoSubscriptionManager, self).__init__(pubnub_instance)
        self._start_worker()

    def _set_consumer_event(self):
        """Set the event shared with the message worker."""
        self._consumer_event.set()

    def _message_queue_put(self, message):
        """Put ``message`` on the queue read by the message worker."""
        self._message_queue.put(message)

    def _start_worker(self):
        """Create the message worker and schedule its ``run`` on the IOLoop."""
        self._consumer = TornadoSubscribeMessageWorker(self._pubnub,
                                                       self._listener_manager,
                                                       self._message_queue,
                                                       self._consumer_event)
        run = stack_context.wrap(self._consumer.run)
        self._pubnub.ioloop.spawn_callback(run)

    def reconnect(self):
        """Clear the stop flag, schedule a subscribe loop, start heartbeats."""
        self._should_stop = False
        self._pubnub.ioloop.add_callback(self._start_subscribe_loop)
        self._register_heartbeat_timer()

    @tornado.gen.coroutine
    def _start_subscribe_loop(self):
        """Run one subscribe request and start the follow-up loop.

        Cancels any loop already running, waits for the subscription lock,
        issues a ``Subscribe`` call for the current channels and groups and
        processes whichever of response or cancellation resolves first.
        """
        try:
            self._stop_subscribe_loop()

            yield self._subscription_lock.acquire()

            self._cancellation_event = Event()

            combined_channels = self._subscription_state.prepare_channel_list(True)
            combined_groups = self._subscription_state.prepare_channel_group_list(True)

            if len(combined_channels) == 0 and len(combined_groups) == 0:
                return

            envelope_future = Subscribe(self._pubnub) \
                .channels(combined_channels).channel_groups(combined_groups) \
                .timetoken(self._timetoken).region(self._region) \
                .filter_expression(self._pubnub.config.filter_expression) \
                .cancellation_event(self._cancellation_event) \
                .future()

            # Keep the waiter in a variable: every Event.wait() call returns
            # a new future, so comparing against a fresh wait() would never
            # match the one handed to the WaitIterator.
            canceller_future = self._cancellation_event.wait()

            wi = tornado.gen.WaitIterator(envelope_future, canceller_future)

            while not wi.done():
                try:
                    result = yield wi.next()
                except Exception as e:
                    logger.error(e)
                    raise
                else:
                    if wi.current_future == envelope_future:
                        envelope = result
                    elif wi.current_future == canceller_future:
                        break

                    self._handle_endpoint_call(envelope.result, envelope.status)
                    # The nested call cancels this run through
                    # _stop_subscribe_loop() and then waits for the lock,
                    # which is released in the finally clause below.
                    self._start_subscribe_loop()

        except PubNubTornadoException as e:
            if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
                self._pubnub.ioloop.add_callback(self._start_subscribe_loop)
            else:
                self._listener_manager.announce_status(e.status)
        except Exception as e:
            logger.error(e)
            raise
        finally:
            self._cancellation_event.set()
            yield tornado.gen.moment
            self._cancellation_event = None
            self._subscription_lock.release()

    def _stop_subscribe_loop(self):
        """Set the cancellation event of the running loop, if there is one."""
        if self._cancellation_event is not None:
            self._cancellation_event.set()

    def _stop_heartbeat_timer(self):
        """Stop the periodic heartbeat callback, if one was registered."""
        if self._heartbeat_periodic_callback is not None:
            self._heartbeat_periodic_callback.stop()

    def _register_heartbeat_timer(self):
        """Run the base-class hook, then start a periodic heartbeat callback.

        The period is ``config.heartbeat_interval`` multiplied by
        ``HEARTBEAT_INTERVAL_MULTIPLIER``.
        """
        super(TornadoSubscriptionManager, self)._register_heartbeat_timer()
        self._heartbeat_periodic_callback = PeriodicCallback(
            stack_context.wrap(self._perform_heartbeat_loop),
            self._pubnub.config.heartbeat_interval * TornadoSubscriptionManager.HEARTBEAT_INTERVAL_MULTIPLIER,
            self._pubnub.ioloop)
        self._heartbeat_periodic_callback.start()

    @tornado.gen.coroutine
    def _perform_heartbeat_loop(self):
        """Send one heartbeat for the current presence channels and groups.

        Returns without sending when there are none. The resulting status is
        announced to listeners depending on
        ``config.heartbeat_notification_options``.
        """
        if self._heartbeat_call is not None:
            # TODO: cancel call
            pass

        cancellation_event = Event()
        state_payload = self._subscription_state.state_payload()
        presence_channels = self._subscription_state.prepare_channel_list(False)
        presence_groups = self._subscription_state.prepare_channel_group_list(False)

        if len(presence_channels) == 0 and len(presence_groups) == 0:
            return

        try:
            envelope = yield self._pubnub.heartbeat() \
                .channels(presence_channels) \
                .channel_groups(presence_groups) \
                .state(state_payload) \
                .cancellation_event(cancellation_event) \
                .future()

            heartbeat_verbosity = self._pubnub.config.heartbeat_notification_options
            # NOTE(review): `is_error` is referenced without being called;
            # if it is a method this condition is always truthy -- confirm.
            if envelope.status.is_error:
                # NOTE(review): both operands compare against ALL; one of
                # them was probably meant to be a different option -- confirm.
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL or \
                        heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)
            else:
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)

        except PubNubTornadoException:
            pass
            # TODO: check correctness
            # if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
            #     self._start_subscribe_loop()
            # else:
            #     self._listener_manager.announce_status(e.status)
        finally:
            cancellation_event.set()

    @tornado.gen.coroutine
    def _send_leave(self, unsubscribe_operation):
        """Send a ``Leave`` call and announce the resulting status.

        :param unsubscribe_operation: object providing the ``channels`` and
            ``channel_groups`` to leave.
        """
        envelope = yield Leave(self._pubnub) \
            .channels(unsubscribe_operation.channels) \
            .channel_groups(unsubscribe_operation.channel_groups).future()

        self._listener_manager.announce_status(envelope.status)
class TornadoTransmission():
    '''Transmission that batches events and posts them with Tornado's
    AsyncHTTPClient. Events are taken from the `pending` queue and one
    response dict per event is put on the `responses` queue.'''

    def __init__(self, max_concurrent_batches=10, block_on_send=False,
                 block_on_response=False, max_batch_size=100, send_frequency=0.25,
                 user_agent_addition=''):
        '''
        max_concurrent_batches: number of permits of the batch semaphore
        block_on_send: use `put` instead of `put_nowait` on the pending queue
        block_on_response: use `put` instead of `put_nowait` on the responses queue
        max_batch_size: number of buffered events that triggers a flush
        send_frequency: seconds to wait for an event before flushing
        user_agent_addition: text appended to the user agent header

        Raises ImportError when tornado is not available.
        '''
        if not has_tornado:
            raise ImportError('TornadoTransmission requires tornado, but it was not found.')

        self.block_on_send = block_on_send
        self.block_on_response = block_on_response

        self.max_batch_size = max_batch_size
        self.send_frequency = send_frequency

        user_agent = "libhoney-py/" + VERSION
        if user_agent_addition:
            user_agent += " " + user_agent_addition

        self.http_client = AsyncHTTPClient(
            force_instance=True,
            defaults=dict(user_agent=user_agent))

        # libhoney adds events to the pending queue for us to send
        self.pending = Queue(maxsize=1000)
        # we hand back responses from the API on the responses queue
        self.responses = Queue(maxsize=2000)

        # maps each in-flight HTTPRequest to its start time and events
        self.batch_data = {}
        self.sd = statsd.StatsClient(prefix="libhoney")
        self.batch_sem = Semaphore(max_concurrent_batches)

    def start(self):
        '''start schedules the `_sender` loop on the current IOLoop'''
        ioloop.IOLoop.current().spawn_callback(self._sender)

    def send(self, ev):
        '''send accepts an event and queues it to be sent'''
        self.sd.gauge("queue_length", self.pending.qsize())
        try:
            if self.block_on_send:
                self.pending.put(ev)
            else:
                self.pending.put_nowait(ev)
            self.sd.incr("messages_queued")
        except QueueFull:
            response = {
                "status_code": 0,
                "duration": 0,
                "metadata": ev.metadata,
                "body": "",
                "error": "event dropped; queue overflow",
            }
            if self.block_on_response:
                self.responses.put(response)
            else:
                try:
                    self.responses.put_nowait(response)
                except QueueFull:
                    # if the response queue is full when trying to add an event
                    # queue is full response, just skip it.
                    pass
            self.sd.incr("queue_overflow")

    # We're using the older decorator/yield model for compatibility with
    # Python versions before 3.5.
    # See: http://www.tornadoweb.org/en/stable/guide/coroutines.html#python-3-5-async-and-await
    @gen.coroutine
    def _sender(self):
        '''_sender is the control loop that pulls events off the `self.pending`
        queue and submits batches for actual sending. A `None` item signals
        shutdown: buffered events are flushed and the loop ends.'''
        import datetime

        events = []
        last_flush = time.time()
        while True:
            try:
                # tornado treats a bare number as an absolute deadline on the
                # IOLoop clock, so a relative wait has to be a timedelta.
                ev = yield self.pending.get(
                    timeout=datetime.timedelta(seconds=self.send_frequency))
                if ev is None:
                    # signals shutdown
                    yield self._flush(events)
                    return
                events.append(ev)
                # `>=` keeps batches at max_batch_size events at most
                if (len(events) >= self.max_batch_size or
                        time.time() - last_flush > self.send_frequency):
                    yield self._flush(events)
                    events = []
                    # restart the timer, otherwise every following event
                    # would be flushed on its own until the queue runs dry
                    last_flush = time.time()
            except TimeoutError:
                yield self._flush(events)
                events = []
                last_flush = time.time()

    @gen.coroutine
    def _flush(self, events):
        '''Groups `events` by destination and sends one batch per group.
        Does nothing when `events` is empty.'''
        if not events:
            return
        for dest, group in group_events_by_destination(events).items():
            yield self._send_batch(dest, group)

    @gen.coroutine
    def _send_batch(self, destination, events):
        ''' Makes a single batch API request with the given list of events. The
        `destination` argument contains the write key, API host and dataset
        name used to build the request.'''
        start = time.time()
        status_code = 0
        try:
            # enforce max_concurrent_batches
            # NOTE(review): the fetch below is not awaited, so the permit is
            # given back as soon as the request is handed to the client --
            # confirm this is the intended limit.
            yield self.batch_sem.acquire()
            url = urljoin(urljoin(destination.api_host, "/1/batch/"),
                          destination.dataset)
            payload = []
            for ev in events:
                event_time = ev.created_at.isoformat()
                # naive timestamps get a "Z" suffix
                if ev.created_at.tzinfo is None:
                    event_time += "Z"
                payload.append({
                    "time": event_time,
                    "samplerate": ev.sample_rate,
                    "data": ev.fields()})
            req = HTTPRequest(
                url,
                method='POST',
                headers={
                    "X-Honeycomb-Team": destination.writekey,
                    "Content-Type": "application/json",
                },
                body=json.dumps(payload, default=json_default_handler),
            )
            self.http_client.fetch(req, self._response_callback)
            # store the events that were sent so we can process responses later
            # it is important that we delete these eventually, or we'll run into memory issues
            self.batch_data[req] = {"start": start, "events": events}
        except Exception as e:
            # Catch all exceptions and hand them to the responses queue.
            self._enqueue_errors(status_code, e, start, events)
        finally:
            self.batch_sem.release()

    def _enqueue_errors(self, status_code, error, start, events):
        '''Queues one error response per event and counts each as a send error.'''
        for ev in events:
            self.sd.incr("send_errors")
            self._enqueue_response(status_code, "", error, start, ev.metadata)

    def _response_callback(self, resp):
        '''Turns the API response for one batch into per-event responses and
        drops the batch from `batch_data`.'''
        # resp.request should be the same HTTPRequest object built by _send_batch
        # and mapped to values in batch_data
        events = self.batch_data[resp.request]["events"]
        start = self.batch_data[resp.request]["start"]
        try:
            status_code = resp.code
            resp.rethrow()

            statuses = [d["status"] for d in json.loads(resp.body)]
            for ev, status in zip(events, statuses):
                self._enqueue_response(status, "", None, start, ev.metadata)
                self.sd.incr("messages_sent")
        except Exception as e:
            self._enqueue_errors(status_code, e, start, events)
            self.sd.incr("send_errors")
        finally:
            # clean up the data for this batch
            del self.batch_data[resp.request]

    def _enqueue_response(self, status_code, body, error, start, metadata):
        '''Builds a response dict (duration in milliseconds since `start`) and
        puts it on the responses queue. In non-blocking mode the response is
        dropped when the queue is full.'''
        resp = {
            "status_code": status_code,
            "body": body,
            "error": error,
            "duration": (time.time() - start) * 1000,
            "metadata": metadata
        }
        if self.block_on_response:
            self.responses.put(resp)
        else:
            try:
                self.responses.put_nowait(resp)
            except QueueFull:
                pass

    def close(self):
        '''call close to send all in-flight requests and shut down the
        senders nicely. A `None` marker is put on the pending queue and on
        the responses queue, each with a 10 second timeout.'''
        import datetime

        # a bare number would be an absolute deadline that has already
        # passed, so express the 10 seconds as a relative timedelta
        timeout = datetime.timedelta(seconds=10)
        try:
            self.pending.put(None, timeout)
        except QueueFull:
            pass
        # signal to the responses queue that nothing more is coming.
        try:
            self.responses.put(None, timeout)
        except QueueFull:
            pass

    def get_response_queue(self):
        ''' return the responses queue on to which will be sent the response
        objects from each event send'''
        return self.responses
def get_resized(self, gallery, photo, width=None, height=None, quality=60,
        rotation=0.0, img_format=None, orientation=0):
    """
    Fetch a resized rendition of a photo, from the cache when possible.

    Written as a generator-style coroutine: it yields while waiting for the
    per-rendition mutex and for the resize worker, and delivers its result
    by raising Return((img_format, file_name, file_data)).
    """
    # The original image this rendition is derived from.
    source_node = self._fs_node.join_node(gallery, photo)

    if img_format is not None:
        # Caller chose the format explicitly.
        img_format = ImageFormat(img_format)
    else:
        # No format requested: derive it from the file's MIME type and the
        # requested quality.
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as detector:
            mime_type = detector.id_filename(source_node.abs_path)
            self._log.debug('%s/%s detected format %s',
                    gallery, photo, mime_type)
            if mime_type == 'image/gif':
                img_format = ImageFormat.GIF
            elif quality == 100:
                # Assume PNG
                img_format = ImageFormat.PNG
            else:
                # Assume JPEG
                img_format = ImageFormat.JPEG

    self._log.debug('%s/%s using %s format',
            gallery, photo, img_format.name)

    # Let get_dimensions turn the requested size into the target size.
    width, height = self.get_dimensions(gallery, photo, width, height)
    self._log.debug('%s/%s target dimensions %d by %d',
            gallery, photo, width, height)

    # Work out the cache location and serve from the cache on a hit.
    cache_dir, cache_name = self._get_cache_name(
            gallery, photo, width, height, quality, rotation, img_format)
    cached = self._read_cache(source_node, cache_dir, cache_name)
    if cached is not None:
        raise Return((img_format, cache_name, cached))

    # One semaphore per distinct rendition; create it on first use.
    mutex_key = (gallery, photo, width, height, quality, rotation,
            img_format)
    mutex = self._mutexes.get(mutex_key)
    if mutex is None:
        mutex = Semaphore(1)
        self._mutexes[mutex_key] = mutex

    resize_args = (gallery, photo, width, height, quality,
            rotation, img_format.value, orientation)
    try:
        self._log.debug('%s/%s waiting for mutex', gallery, photo)
        yield mutex.acquire()

        # Mutex held: hand the resize job to the worker pool.
        self._log.debug('%s/%s retrieving resized image (args=%s)',
                gallery, photo, resize_args)
        resized = yield self._pool.apply(
                func=self._do_resize, args=resize_args)
        (img_format, file_name, file_data) = resized
        raise Return((img_format, file_name, file_data))
    except Return:
        # The result travelling up the stack, not an error.
        raise
    except:
        self._log.exception('Error resizing photo; gallery: %s, photo: %s, '\
                'width: %d, height: %d, quality: %f, rotation: %f, format: %s',
                gallery, photo, width, height, quality, rotation, img_format)
        raise
    finally:
        mutex.release()
class ManagedKernelPool(KernelPool):
    '''
    Pool of prespawned kernels handed out on a borrow/return basis.

    A semaphore sized to the number of kernels gates `acquire`, so callers
    wait when every kernel is on loan.  All kernels are cleaned up on
    `shutdown`.
    '''

    def __init__(self, prespawn_count, kernel_manager):
        # A falsy count is bumped to one so there is always a delegate kernel.
        super(ManagedKernelPool, self).__init__(
            prespawn_count if prespawn_count else 1, kernel_manager)

        self.kernel_clients = {}
        self.on_recv_funcs = {}
        self.pool_index = 0
        self.kernel_pool = []

        prespawned = self.kernel_manager.list_kernel_ids()
        self.kernel_semaphore = Semaphore(len(prespawned))

        # Attach a client and an iopub listener to every prespawned kernel.
        for kid in prespawned:
            self.kernel_clients[kid] = kernel_manager.get_kernel(kid).client()
            self.kernel_pool.append(kid)
            stream = self.kernel_manager.connect_iopub(kid)
            stream.on_recv(self.create_on_reply(kid))

    @gen.coroutine
    def acquire(self):
        '''
        Borrows the kernel at the front of the pool, waiting on the
        semaphore until one is available. The kernel must be handed back
        through `release`.

        :return: Tuple of (kernel client, kernel id)
        '''
        yield self.kernel_semaphore.acquire()
        borrowed = self.kernel_pool.pop(0)
        raise gen.Return((self.kernel_clients[borrowed], borrowed))

    def release(self, kernel_id):
        '''
        Appends a borrowed kernel to the pool and frees its semaphore slot.

        :param kernel_id: Id of the kernel being returned
        '''
        self.kernel_pool.append(kernel_id)
        self.kernel_semaphore.release()

    def _on_reply(self, kernel_id, msg_list):
        # Strip the routing identities, deserialize the remainder and pass
        # the message to the callback registered for this kernel.
        idents, msg_list = self.kernel_clients[
            kernel_id].session.feed_identities(msg_list)
        message = self.kernel_clients[kernel_id].session.deserialize(msg_list)
        handler = self.on_recv_funcs[kernel_id]
        handler(message)

    def create_on_reply(self, kernel_id):
        '''
        Builds the iopub handler for one kernel; the closure pins
        `kernel_id` so each kernel gets its own handler.
        '''
        def _handler(msg_list):
            return self._on_reply(kernel_id, msg_list)
        return _handler

    def on_recv(self, kernel_id, func):
        '''
        Sets the single io_pub callback used for the given kernel, replacing
        any previous one (avoids multiple callbacks per kernel client).

        :param kernel_id: Id of the kernel
        :param func: Callback function to handle the message
        '''
        self.on_recv_funcs[kernel_id] = func

    def shutdown(self):
        '''
        Stops the channels of every pooled kernel client, shuts each kernel
        down immediately, then defers to the base class for the rest.
        '''
        for kid in self.kernel_clients:
            client = self.kernel_clients[kid]
            client.stop_channels()
            self.kernel_manager.shutdown_kernel(kid, now=True)

        # Kernels that were not created for this pool are handled by the
        # base class shutdown.
        super(ManagedKernelPool, self).shutdown()
class HackadayAPI(object):
    """
    Core Hackaday.io API handler.

    Every HTTP request goes through :meth:`api_fetch`, which allows one
    request in flight at a time and waits until ``rqlim_time`` seconds have
    passed since the previous successful request.
    """

    HAD_API_URI = 'https://api.hackaday.io/v1'
    HAD_AUTH_URI = ('https://hackaday.io/authorize'
                    '?client_id=%(CLIENT_ID)s'
                    '&response_type=code')
    HAD_TOKEN_URI = ('https://auth.hackaday.io/access_token'
                     '?client_id=%(CLIENT_ID)s'
                     '&client_secret=%(CLIENT_SECRET)s'
                     '&code=%(CODE)s'
                     '&grant_type=authorization_code')

    # Rate limiting
    RQLIM_TIME = 30  # seconds

    def __init__(self, client_id, client_secret, api_key,
                 api_uri=HAD_API_URI, auth_uri=HAD_AUTH_URI,
                 token_uri=HAD_TOKEN_URI, rqlim_time=RQLIM_TIME,
                 client=None, log=None, io_loop=None):
        """
        Store credentials and endpoints.  ``log``, ``io_loop`` and
        ``client`` default to a module logger, the current IOLoop and a new
        ``AsyncHTTPClient`` respectively.
        """
        if log is None:
            log = extdlog.getLogger(self.__class__.__module__)

        if io_loop is None:
            io_loop = IOLoop.current()

        if client is None:
            client = AsyncHTTPClient()

        self._client = client
        self._io_loop = io_loop
        self._log = log
        self._client_id = client_id
        self._client_secret = client_secret
        self._api_key = api_key
        self._api_uri = api_uri
        self._auth_uri = auth_uri
        self._token_uri = token_uri

        # IOLoop timestamp of the last successful request, and the minimum
        # spacing (seconds) enforced between requests.
        self._last_rq = 0.0
        self._rqlim_time = rqlim_time

        # Semaphore to limit concurrent access
        self._rq_sem = Semaphore(1)

        # If None, then no "forbidden" status is current.
        # Otherwise, this stores when the "forbidden" flag expires.
        self._forbidden_expiry = None

    @property
    def is_forbidden(self):
        """
        Return true while a back-off set by :meth:`api_fetch` (after a 403
        response or a connection reset) has not yet expired.
        """
        if self._forbidden_expiry is None:
            return False

        return self._forbidden_expiry > self._io_loop.time()

    @coroutine
    def _ratelimit_sleep(self):
        """
        Ensure we don't exceed the rate limit by comparing against the
        timestamp of the last request and sleeping if required.
        """
        now = self._io_loop.time()

        # Figure out if we need to wait before the next request
        delay = (self._last_rq + self._rqlim_time) - now
        self._log.trace('Last request at %f, delay: %f', self._last_rq, delay)
        if delay <= 0:
            # Nope, we're clear
            return

        self._log.debug('Waiting %f sec for rate limit', delay)
        yield sleep(delay)
        self._log.trace('Resuming operations')

    def _decode(self, response, default_encoding='UTF-8'):
        """
        Decode a given response body using its Content-Type header.
        """
        return decode_body(response.headers['Content-Type'],
                           response.body, default_encoding)

    @coroutine
    def api_fetch(self, uri, **kwargs):
        """
        Make a raw request whilst respecting the HAD API request limits.

        This is primarily to support retrieval of avatars and other data
        without hitting the HAD.io site needlessly hard.

        ``kwargs`` are passed to the HTTP client's ``fetch``; connect and
        request timeouts default to 120 seconds.  Returns the response via
        ``Return``.  A 403 response sets a one hour back-off and a
        connection reset a 15 minute back-off (see :attr:`is_forbidden`)
        before the error is re-raised.
        """
        kwargs.setdefault('connect_timeout', 120.0)
        kwargs.setdefault('request_timeout', 120.0)

        # Acquire outside the try block so the ``finally`` only ever
        # releases a permit that was actually obtained.
        yield self._rq_sem.acquire()
        try:
            while True:
                try:
                    yield self._ratelimit_sleep()
                    response = yield self._client.fetch(uri, **kwargs)
                    self._last_rq = self._io_loop.time()
                    self._log.audit('Request:\n'
                        '%s %s\n'
                        'Headers: %s\n'
                        'Response: %s\n'
                        'Headers: %s\n'
                        'Body:\n%s',
                        response.request.method,
                        response.request.url,
                        response.request.headers,
                        response.code,
                        response.headers,
                        response_text(response))
                    break
                except gaierror as e:
                    # NOTE(review): both paths re-raise, so a resolver
                    # EAGAIN is not retried and this loop runs at most
                    # once.  Confirm whether a retry was intended here.
                    if e.errno != EAGAIN:
                        raise
                    raise
                except HTTPError as e:
                    if e.response is not None:
                        self._log.audit('Request:\n'
                            '%s %s\n'
                            'Headers: %s\n'
                            'Response: %s\n'
                            'Headers: %s\n'
                            'Body:\n%s',
                            e.response.request.method,
                            e.response.request.url,
                            e.response.request.headers,
                            e.response.code,
                            e.response.headers,
                            response_text(e.response))
                    if e.code == 403:
                        # Back-end is rate limiting us.  Back off an hour.
                        self._forbidden_expiry = self._io_loop.time() \
                                + 3600.0
                    raise
                except ConnectionResetError:
                    # Back-end is blocking us.  Back off 15 minutes.
                    self._forbidden_expiry = self._io_loop.time() \
                            + 900.0
                    raise
        finally:
            self._rq_sem.release()

        raise Return(response)

    @coroutine
    def _api_call(self, uri, query=None, token=None, api_key=True, **kwargs):
        """
        Perform an API request and return the decoded JSON body.

        ``query`` is URL-encoded onto ``uri`` (list values repeat the key),
        the API key is added unless ``api_key`` is false, and ``token`` is
        sent as an ``Authorization`` header.  A ``uri`` not starting with
        ``http`` is taken relative to the API base URI.  Raises
        ``ValueError`` if the response is not ``application/json``.
        """
        headers = kwargs.setdefault('headers', {})
        headers.setdefault('Accept', 'application/json')
        if token is not None:
            headers['Authorization'] = 'token %s' % token

        if query is None:
            query = {}

        if api_key:
            query.setdefault('api_key', self._api_key)

        self._log.audit('Query arguments: %r', query)

        def encode_kv(k, v):
            return '%s=%s' % (k, urlparse.quote_plus(str(v)))

        def encode_item(item):
            (key, value) = item
            if isinstance(value, list):
                return '&'.join(encode_kv(key, v) for v in value)
            return encode_kv(key, value)

        if len(query) > 0:
            uri += '?%s' % '&'.join(map(encode_item, query.items()))

        if not uri.startswith('http'):
            uri = self._api_uri + uri

        self._log.audit('%s %r', kwargs.get('method', 'GET'), uri)
        response = yield self.api_fetch(uri, **kwargs)

        # If we get here, then our service is back.
        self._forbidden_expiry = None
        (ct, ctopts, body) = self._decode(response)
        if ct.lower() != 'application/json':
            raise ValueError('Server returned unrecognised type %s' % ct)
        raise Return(json.loads(body))

    # oAuth endpoints

    @property
    def auth_uri(self):
        """
        Return the auth URI that we need to send the user to if they're not
        logged in.
        """
        return self._auth_uri % dict(CLIENT_ID=self._client_id)

    def get_token(self, code):
        """
        Fetch the token for API queries from the authorization code given.
        """
        # Determine where to retrieve the access token from
        post_uri = self._token_uri % dict(
            CLIENT_ID=urlparse.quote_plus(self._client_id),
            CLIENT_SECRET=urlparse.quote_plus(self._client_secret),
            CODE=urlparse.quote_plus(code)
        )

        return self._api_call(
            post_uri, method='POST', body=b'', api_key=False)

    # Pagination options

    def _page_query_opts(self, page, per_page):
        """
        Build the query dict for the pagination arguments that are set.
        """
        query = {}
        if page is not None:
            query['page'] = int(page)
        if per_page is not None:
            query['per_page'] = int(per_page)

        return query

    # User API endpoints

    def get_current_user(self, token):
        """
        Fetch the current user's profile information.
        """
        return self._api_call('/me', token=token)

    def _user_query_opts(self, sortby, page, per_page):
        """
        Pagination query plus a validated ``UserSortBy`` sort key.
        """
        query = self._page_query_opts(page, per_page)
        sortby = UserSortBy(sortby)
        query['sortby'] = sortby.value
        return query

    _GET_USERS_WORKAROUND_RE = re.compile(
        r'<a href="([^"]+)" class="hacker-image">')
    _PRIVATE_MSG_LINK_RE = re.compile(
        r'<a href="/messages/new\?user=(\d+)">')

    @coroutine
    def get_user_ids(self, sortby=UserSortBy.influence, page=None):
        """
        Scrape one page of the hackaday.io "hackers" listing and return the
        numeric user IDs found on the linked profile pages.
        """
        if page is None:
            page = 1

        sortby = UserSortBy(sortby)

        response = yield self.api_fetch(
            'https://hackaday.io/hackers?sort=%s&page=%d' \
                    % (sortby.value, page))
        (ct, ctopts, body) = self._decode(response)

        # Body is in HTML, look for links to profile pages
        pages = []
        for line in body.split('\n'):
            match = self._GET_USERS_WORKAROUND_RE.search(line)
            if match:
                pages.append(match.group(1))

        ids = []
        # Fetch each profile page (ugh!) and look for user ID
        # This is literally all we need at this point, the rest we'll
        # get from the API.
        for profile_uri in pages:
            if profile_uri.startswith('/'):
                profile_uri = 'https://hackaday.io' + profile_uri
            response = yield self.api_fetch(profile_uri)
            (ct, ctopts, body) = self._decode(response)
            for line in body.split('\n'):
                match = self._PRIVATE_MSG_LINK_RE.search(line)
                if match:
                    # Only the first match on each profile page is used.
                    ids.append(int(match.group(1)))
                    break

        raise Return(ids)

    @coroutine
    def _get_users_workaround(self, sortby=UserSortBy.influence, page=None):
        """
        Scrape user IDs from the website, then fetch those users in a batch.
        """
        ids = yield self.get_user_ids(sortby, page)
        users = yield self.get_users(ids=ids)
        raise Return(users)

    @coroutine
    def get_users(self, sortby=UserSortBy.influence,
                  ids=None, page=None, per_page=None):
        """
        Retrieve a list of all users

        ``ids`` may be ``None`` (paged listing), a ``slice`` (ID range) or
        an iterable of at most 50 IDs (batch); more raises ``ValueError``.
        """
        query = self._user_query_opts(sortby, page, per_page)

        if ids is None:
            # sortby==newest is broken, has been for a while now.
            if sortby == UserSortBy.newest:
                result = yield self._get_users_workaround(
                    sortby, query.get('page'))
            else:
                result = yield self._api_call('/users', query=query)
        elif isinstance(ids, slice):
            query['ids'] = '%d,%d' % (ids.start, ids.stop)
            result = yield self._api_call('/users/range', query=query)
        else:
            ids = set(ids)
            if len(ids) > 50:
                raise ValueError('Too many IDs')
            query['ids'] = ','.join(['%d' % uid for uid in ids])
            result = yield self._api_call('/users/batch', query=query)
        raise Return(result)

    def search_users(self, screen_name=None, location=None, tag=None,
                     sortby=UserSortBy.influence, page=None, per_page=None):
        """
        Query ``/users/search`` with whichever criteria are not ``None``.
        """
        query = self._user_query_opts(sortby, page, per_page)

        for (arg, val) in (
                ('screen_name', screen_name),
                ('location', location),
                ('tag', tag)
        ):
            if val is not None:
                query[arg] = str(val)
        return self._api_call('/users/search', query=query)

    def get_user(self, user_id):
        """Query ``/users/<user_id>``."""
        return self._api_call('/users/%d' % user_id)

    def get_user_followers(self, user_id,
                           sortby=UserSortBy.influence,
                           page=None, per_page=None):
        """Query ``/users/<user_id>/followers``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/users/%d/followers' % user_id, query=query)

    def get_user_following(self, user_id,
                           sortby=UserSortBy.influence,
                           page=None, per_page=None):
        """Query ``/users/<user_id>/following``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/users/%d/following' % user_id, query=query)

    def get_user_projects(self, user_id,
                          sortby=ProjectSortBy.skulls,
                          page=None, per_page=None):
        """Query ``/users/<user_id>/projects``."""
        query = self._project_query_opts(sortby, page, per_page)
        return self._api_call('/users/%d/projects' % user_id, query=query)

    def get_user_skulls(self, user_id,
                        sortby=UserSortBy.influence,
                        page=None, per_page=None):
        """Query ``/users/<user_id>/skulls``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/users/%d/skulls' % user_id, query=query)

    def get_user_links(self, user_id, page=None, per_page=None):
        """Query ``/users/<user_id>/links``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/users/%d/links' % user_id, query=query)

    def get_user_tags(self, user_id, page=None, per_page=None):
        """Query ``/users/<user_id>/tags``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/users/%d/tags' % user_id, query=query)

    def get_user_pages(self, user_id, page=None, per_page=None):
        """Query ``/users/<user_id>/pages``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/users/%d/pages' % user_id, query=query)

    # Projects API

    def _project_query_opts(self, sortby, page, per_page):
        """
        Pagination query plus a validated ``ProjectSortBy`` sort key.
        """
        query = self._page_query_opts(page, per_page)
        sortby = ProjectSortBy(sortby)
        query['sortby'] = sortby.value
        return query

    def get_projects(self, sortby=ProjectSortBy.skulls,
                     ids=None, page=None, per_page=None):
        """
        Retrieve a list of all projects

        ``ids`` may be ``None`` (paged listing), a ``slice`` (ID range) or
        an iterable of at most 50 IDs (batch); more raises ``ValueError``.
        """
        query = self._project_query_opts(sortby, page, per_page)

        if ids is None:
            return self._api_call('/projects', query=query)
        elif isinstance(ids, slice):
            # Use the bounds of the slice that was passed in, not the
            # attributes of the builtin ``slice`` type.
            query['ids'] = '%d,%d' % (ids.start, ids.stop)
            return self._api_call('/projects/range', query=query)
        else:
            ids = set(ids)
            if len(ids) > 50:
                raise ValueError('Too many IDs')
            query['ids'] = ','.join(['%d' % pid for pid in ids])
            return self._api_call('/projects/batch', query=query)

    def search_projects(self, term, sortby=ProjectSortBy.skulls,
                        page=None, per_page=None):
        """Query ``/projects/search`` for ``term``."""
        query = self._project_query_opts(sortby, page, per_page)
        query['search_term'] = str(term)
        return self._api_call('/projects/search', query=query)

    def get_project(self, project_id):
        """Query ``/projects/<project_id>``."""
        return self._api_call('/projects/%d' % project_id)

    def get_project_team(self, project_id,
                         sortby=UserSortBy.influence,
                         page=None, per_page=None):
        """Query ``/projects/<project_id>/team``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/projects/%d/team' % project_id, query=query)

    def get_project_followers(self, project_id,
                              sortby=UserSortBy.influence,
                              page=None, per_page=None):
        """Query ``/projects/<project_id>/followers``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/projects/%d/followers' % project_id,
                              query=query)

    def get_project_skulls(self, project_id,
                           sortby=UserSortBy.influence,
                           page=None, per_page=None):
        """Query ``/projects/<project_id>/skulls``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/projects/%d/skulls' % project_id,
                              query=query)

    def get_project_comments(self, project_id,
                             sortby=UserSortBy.influence,
                             page=None, per_page=None):
        """Query ``/projects/<project_id>/comments``."""
        query = self._user_query_opts(sortby, page, per_page)
        return self._api_call('/projects/%d/comments' % project_id,
                              query=query)

    def get_project_links(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/links``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/links' % project_id,
                              query=query)

    def get_project_images(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/images``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/images' % project_id,
                              query=query)

    def get_project_components(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/components``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/components' % project_id,
                              query=query)

    def get_project_tags(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/tags``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/tags' % project_id,
                              query=query)

    def get_project_logs(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/logs``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/logs' % project_id,
                              query=query)

    def get_project_instructions(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/instructions``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/instructions' % project_id,
                              query=query)

    def get_project_details(self, project_id, page=None, per_page=None):
        """Query ``/projects/<project_id>/details``."""
        query = self._page_query_opts(page, per_page)
        return self._api_call('/projects/%d/details' % project_id,
                              query=query)
class WorkerPool(object):
    """
    Runs submitted callables in external threads, one thread per task,
    with a semaphore capping how many run at once.
    """

    def __init__(self, workers=None, io_loop=None):
        # Defaults: one worker per CPU, and the current IOLoop.
        worker_count = cpu_count() if workers is None else workers
        self._io_loop = IOLoop.current() if io_loop is None else io_loop
        self._sem = Semaphore(worker_count)
        self._queue = Queue()
        self._active = False

    @coroutine
    def apply(self, func, args=None, kwds=None):
        """
        Queue ``func(*args, **kwds)`` for execution in a worker thread and
        return its result once available.
        """
        call_args = () if args is None else args
        call_kwds = {} if kwds is None else kwds

        # Placeholder that the worker's completion callback resolves.
        pending = Future()
        yield self._queue.put((pending, func, call_args, call_kwds))

        # Make sure the queue manager is running; it returns immediately
        # if it already is.
        self._io_loop.add_callback(self._queue_manager)

        outcome = yield pending
        raise Return(outcome)

    @coroutine
    def _apply(self, future, func, args=None, kwds=None):
        """
        Wait for a free worker slot, then run ``func`` in a new thread and
        resolve ``future`` with its result or exception.
        """
        yield self._sem.acquire()

        def _deliver(err, res):
            # Runs on the IOLoop: free the slot, then settle the future.
            self._sem.release()
            if err is None:
                future.set_result(res)
            else:
                future.set_exc_info(err)

        def _worker():
            # Runs in the worker thread; the outcome is passed to the
            # IOLoop via add_callback.
            failure = None
            value = None
            try:
                value = func(*args, **kwds)
            except:
                failure = exc_info()
            self._io_loop.add_callback(_deliver, failure, value)

        Thread(target=_worker).start()

    @coroutine
    def _queue_manager(self):
        """
        Drain the request queue, scheduling each request onto the IOLoop.
        Only one instance runs at a time.
        """
        if self._active:
            # Another manager is already draining the queue.
            return

        try:
            self._active = True
            while True:
                request = yield self._queue.get()
                (pending, target, call_args, call_kwds) = request
                self._io_loop.add_callback(
                    self._apply, pending, target, call_args, call_kwds)
        finally:
            self._active = False
class Crawler(object): def _init_defaults(self): self.start_link = None self.link_priority = 2 self.img_priority = 8 self.politeness = 2 self.workers_limit = 10 # allow at most 10 concurrent workers self.link_regex = re.compile("^http://.*") self.img_regex = re.compile(".*") self.fname_digits = 4 self.min_width = 200 self.min_height = 200 self.img_dir = "E:/tmp/" self.idle_wait_loops = 100 self.port = 8888 def _load_config(self): parser = ConfigParser.ConfigParser() parser.read("config.ini") if parser.has_option("global", "starturl"): starturl = parser.get("global", "starturl") self.start_link = starturl if parser.has_option("global", "linkregex"): self.link_regex = re.compile(parser.get("global", "linkregex")) if parser.has_option("global", "imgregex"): self.img_regex = re.compile(parser.get("global", "imgregex")) if parser.has_option("global", "politeness"): politeness = parser.getint("global", "politeness") if politeness <= 0: print "politeness must be a positive integer" raise SystemExit() self.politeness = politeness if parser.has_option("global", "imgdir"): imgdir = parser.get("global", "imgdir") if not os.path.exists(imgdir) or not os.path.isdir(imgdir): print "invalid imgdir configuration" raise SystemExit() if not imgdir.endswith("/"): imgdir += "/" self.img_dir = imgdir if parser.has_option("global", "minwidth"): width = parser.getint("global", "minwidth") self.min_width = width if parser.has_option("global", "minheight"): height = parser.getint("global", "minheight") self.min_height = height def __init__(self, start_link=None): self._init_defaults() # Now load the config file to override defaults self._load_config() if start_link: self.start_link = start_link if not self.start_link: raise SystemExit("No start link is provided, exiting now...") links.put(self.start_link) self.semaphore = Semaphore(self.workers_limit) @gen.coroutine def run(self): # First start an debug server app = Application([(r"/", WebHandler)]) server = HTTPServer(app) 
server.listen(self.port) idle_loops = 0 while True: if imageurls.qsize() == 0 and links.qsize() == 0: print "Both link and image queues are empty now" idle_loops += 1 if idle_loops == self.idle_wait_loops: break else: idle_loops = 0 # clear the idle loop counter if imageurls.qsize() == 0: self.handle_links() elif links.qsize() == 0: self.handle_imageurls() else: choices = [0] * self.link_priority + [1] * self.img_priority choice = random.choice(choices) if choice: self.handle_imageurls() else: self.handle_links() yield gen.sleep(0.1 * self.politeness) # Wait for all link handlers links.join() # Handling imageurls if generated by the last few links while imageurls.qsize(): self.handle_imageurls() imageurls.join() @gen.coroutine def handle_links(self): yield self.semaphore.acquire() newlink = yield links.get() # Make sure we haven't visited this one if newlink in visited_links: self.semaphore.release() raise gen.Return() visited_links.add(newlink) # use async client to fetch this url client = AsyncHTTPClient() tries = 3 # Give it 3 chances before putting it in failure while tries: response = yield client.fetch(newlink) if response.code == 200: break tries -= 1 # release the semaphore self.semaphore.release() if response.code != 200: link_failures.append(newlink) print "[FAILURE] - %s" % newlink raise gen.Return() # TODO: replace this with a report api print "[VISITED] - %s" % newlink # parse url to get the base url components = urlparse.urlparse(newlink) baseurl = components[0] + "://" + components[1] path = components[2] # parse the html with bs soup = bs4.BeautifulSoup(response.body) # extract valid links and put into links a_tags = soup.find_all("a") for tag in a_tags: if "href" not in tag.attrs: continue href = tag["href"] if href.startswith("#"): continue if href.startswith("/"): # relative href = baseurl + href else: if not path.endswith("/"): path = path[: path.rfind("/") + 1] href = baseurl + "/" + path + href if not self.link_regex.match(href): continue if 
href in visited_links: continue links.put(href) print "NEWLINK:", href # extract imgs and put into imageurls img_tags = soup.find_all("img") for tag in img_tags: if "src" not in tag.attrs: continue src = tag["src"] if src.startswith("/"): # relative src = baseurl + src if not self.img_regex.match(src): continue if src in downloaded_images: continue imageurls.put(src) print "NEW IMAGE:", src # now the task is done links.task_done() @gen.coroutine def handle_imageurls(self): yield self.semaphore.acquire() imgurl = yield imageurls.get() if imgurl in downloaded_images: self.semaphore.release() raise gen.Return() # mark the image as downloaded downloaded_images.add(imgurl) # use async client to fetch this url client = AsyncHTTPClient() tries = 3 # Give it 3 chances before putting it in failure while tries: response = yield client.fetch(imgurl) if response.code == 200: break tries -= 1 # Download is finished, release semaphore self.semaphore.release() if response.code != 200: download_failures.append(imgurl) print "[FAILURE] - %s" % imgurl raise gen.Return() # TODO: replace this with a report api print "[DOWNLOADED] - %s" % imgurl # Read the file content img = PIL.Image.open(response.buffer) w, h = img.size if w < self.min_width or h < self.min_height: raise gen.Return() # find out the image extension, default to jpg if "." in imgurl: ext = imgurl.split(".")[-1].lower() if ext not in ["jpg", "png", "gif"]: ext = "jpg" elif img.format: ext = img.format.lower() else: ext = "jpg" # increment the counter global img_counter img_counter += 1 fname = str(img_counter).zfill(self.fname_digits) + "." + ext fpath = self.img_dir + fname # save the image file f = open(fpath, "wb") f.write(response.body) # now the task is done imageurls.task_done()
class IPCMessageSubscriber(IPCClient):
    '''
    Salt IPC message subscriber

    Create an IPC client to receive messages from IPC publisher

    An example of a very simple IPCMessageSubscriber connecting to an
    IPCMessagePublisher. This example assumes an already running
    IPCMessagePublisher.

    IMPORTANT: The below example also assumes the IOLoop is NOT running.

    # Import Tornado libs
    import tornado.ioloop

    # Import Salt libs
    import salt.config
    import salt.transport.ipc

    # Create a new IO Loop.
    # We know that this new IO Loop is not currently running.
    io_loop = tornado.ioloop.IOLoop()

    ipc_publisher_socket_path = '/var/run/ipc_publisher.ipc'

    ipc_subscriber = salt.transport.ipc.IPCMessageSubscriber(ipc_publisher_socket_path, io_loop=io_loop)

    # Connect to the server
    # Use the associated IO Loop that isn't running.
    io_loop.run_sync(ipc_subscriber.connect)

    # Wait for some data
    package = ipc_subscriber.read_sync()
    '''
    def __singleton_init__(self, socket_path, io_loop=None):
        super(IPCMessageSubscriber, self).__singleton_init__(socket_path, io_loop=io_loop)
        self._read_sync_future = None
        self._read_stream_future = None
        self._sync_ioloop_running = False
        # Messages unpacked beyond the first one of a read; handed out by
        # later read_sync() calls.
        self.saved_data = []
        # Serialises _read_sync() so only one read is in flight.
        self._sync_read_in_progress = Semaphore()
        # Callables invoked (via the IO loop) for every message received by
        # read_async().
        self.callbacks = set()
        self.reading = False

    @tornado.gen.coroutine
    def _read_sync(self, timeout):
        '''
        Read from the stream until at least one complete message has been
        unpacked.

        :param timeout: Timeout for the first chunk of data, or None to
                        wait indefinitely

        :return: The first message body, or None on timeout. Additional
                 messages are appended to ``saved_data``. Stream errors
                 are re-raised.
        '''
        yield self._sync_read_in_progress.acquire()
        exc_to_raise = None
        ret = None

        try:
            while True:
                if self._read_stream_future is None:
                    self._read_stream_future = self.stream.read_bytes(
                        4096, partial=True)

                if timeout is None:
                    wire_bytes = yield self._read_stream_future
                else:
                    future_with_timeout = FutureWithTimeout(
                        self.io_loop, self._read_stream_future, timeout)
                    wire_bytes = yield future_with_timeout

                self._read_stream_future = None

                # Remove the timeout once we get some data or an exception
                # occurs. We will assume that the rest of the data is already
                # there or is coming soon if an exception doesn't occur.
                timeout = None

                self.unpacker.feed(wire_bytes)
                first = True
                for framed_msg in self.unpacker:
                    if first:
                        ret = framed_msg['body']
                        first = False
                    else:
                        self.saved_data.append(framed_msg['body'])
                if not first:
                    # We read at least one piece of data
                    break
        except TornadoTimeoutError:
            # In the timeout case, just return None.
            # Keep 'self._read_stream_future' alive.
            ret = None
        except tornado.iostream.StreamClosedError as exc:
            log.trace('Subscriber disconnected from IPC %s', self.socket_path)
            self._read_stream_future = None
            exc_to_raise = exc
        except Exception as exc:
            log.error(
                'Exception occurred in Subscriber while handling stream: %s',
                exc)
            self._read_stream_future = None
            exc_to_raise = exc
        finally:
            # Always release, including when an error is about to be
            # re-raised below; otherwise the next read would block forever
            # waiting on the semaphore.
            self._sync_read_in_progress.release()

        if self._sync_ioloop_running:
            # Stop the IO Loop so that self.io_loop.start() will return in
            # read_sync().
            self.io_loop.spawn_callback(self.io_loop.stop)

        if exc_to_raise is not None:
            raise exc_to_raise  # pylint: disable=E0702
        raise tornado.gen.Return(ret)

    def read_sync(self, timeout=None):
        '''
        Read a message from an IPC socket

        The socket must already be connected.
        The associated IO Loop must NOT be running.

        :param int timeout: Timeout when receiving message

        :return: message data if successful. None if timed out. Will raise an
                 exception for all other error conditions.
        '''
        if self.saved_data:
            return self.saved_data.pop(0)

        self._sync_ioloop_running = True
        self._read_sync_future = self._read_sync(timeout)
        # Blocks until _read_sync() schedules io_loop.stop.
        self.io_loop.start()
        self._sync_ioloop_running = False

        ret_future = self._read_sync_future
        self._read_sync_future = None
        return ret_future.result()

    @tornado.gen.coroutine
    def _read_async(self, callback):
        '''
        Read from the stream until it closes, scheduling ``callback`` with
        the body of every message unpacked.

        :param callback: A callable taking the message body
        '''
        while not self.stream.closed():
            try:
                self._read_stream_future = self.stream.read_bytes(4096, partial=True)
                self.reading = True
                wire_bytes = yield self._read_stream_future
                self._read_stream_future = None
                self.unpacker.feed(wire_bytes)
                for framed_msg in self.unpacker:
                    body = framed_msg['body']
                    self.io_loop.spawn_callback(callback, body)
            except tornado.iostream.StreamClosedError:
                log.trace('Subscriber disconnected from IPC %s', self.socket_path)
                break
            except Exception as exc:
                log.error(
                    'Exception occurred while Subscriber handling stream: %s',
                    exc)
                # Pause before the next read attempt.
                yield tornado.gen.sleep(1)

    def __run_callbacks(self, raw):
        # Fan a received message out to every registered callback.
        for callback in self.callbacks:
            self.io_loop.spawn_callback(callback, raw)

    @tornado.gen.coroutine
    def read_async(self):
        '''
        Asynchronously read messages and pass each one to the callables
        registered in ``self.callbacks``.

        Connection attempts are retried, one second apart, until the
        client is connected.
        '''
        while not self.connected():
            try:
                yield self.connect(timeout=5)
            except tornado.iostream.StreamClosedError:
                log.trace('Subscriber closed stream on IPC %s before connect',
                          self.socket_path)
                yield tornado.gen.sleep(1)
            except Exception as exc:
                log.error('Exception occurred while Subscriber connecting: %s',
                          exc)
                yield tornado.gen.sleep(1)
        yield self._read_async(self.__run_callbacks)

    def close(self):
        '''
        Routines to handle any cleanup before the instance shuts down.
        Sockets and filehandles should be closed explicitly, to prevent
        leaks.
        '''
        if not self._closing:
            IPCClient.close(self)
            if self._closing:
                # This will prevent this message from showing up:
                # '[ERROR   ] Future exception was never retrieved:
                # StreamClosedError'
                if self._read_sync_future is not None and self._read_sync_future.done():
                    self._read_sync_future.exception()
                if self._read_stream_future is not None and self._read_stream_future.done():
                    self._read_stream_future.exception()
class IPCMessageSubscriber(IPCClient):
    '''
    Salt IPC message subscriber

    Create an IPC client to receive messages from IPC publisher

    An example of a very simple IPCMessageSubscriber connecting to an
    IPCMessagePublisher. This example assumes an already running
    IPCMessagePublisher.

    IMPORTANT: The below example also assumes the IOLoop is NOT running.

    # Import Tornado libs
    import tornado.ioloop

    # Import Salt libs
    import salt.config
    import salt.transport.ipc

    # Create a new IO Loop.
    # We know that this new IO Loop is not currently running.
    io_loop = tornado.ioloop.IOLoop()

    ipc_publisher_socket_path = '/var/run/ipc_publisher.ipc'

    ipc_subscriber = salt.transport.ipc.IPCMessageSubscriber(ipc_publisher_socket_path, io_loop=io_loop)

    # Connect to the server
    # Use the associated IO Loop that isn't running.
    io_loop.run_sync(ipc_subscriber.connect)

    # Wait for some data
    package = ipc_subscriber.read_sync()
    '''
    def __singleton_init__(self, socket_path, io_loop=None):
        super(IPCMessageSubscriber, self).__singleton_init__(
            socket_path, io_loop=io_loop)
        self._read_sync_future = None
        self._read_stream_future = None
        self._sync_ioloop_running = False
        # Messages unpacked beyond the first one of a read; handed out by
        # later read_sync() calls.
        self.saved_data = []
        # Serialises _read_sync() so only one read is in flight.
        self._sync_read_in_progress = Semaphore()

    @tornado.gen.coroutine
    def _read_sync(self, timeout):
        '''
        Read from the stream until at least one complete message has been
        unpacked.

        :param timeout: Timeout for the first chunk of data, or None to
                        wait indefinitely

        :return: The first message body, or None on timeout. Additional
                 messages are appended to ``saved_data``. Stream errors
                 are re-raised.
        '''
        yield self._sync_read_in_progress.acquire()
        exc_to_raise = None
        ret = None

        try:
            while True:
                if self._read_stream_future is None:
                    self._read_stream_future = self.stream.read_bytes(4096, partial=True)

                if timeout is None:
                    wire_bytes = yield self._read_stream_future
                else:
                    future_with_timeout = FutureWithTimeout(
                        self.io_loop, self._read_stream_future, timeout)
                    wire_bytes = yield future_with_timeout

                self._read_stream_future = None

                # Remove the timeout once we get some data or an exception
                # occurs. We will assume that the rest of the data is already
                # there or is coming soon if an exception doesn't occur.
                timeout = None

                self.unpacker.feed(wire_bytes)
                first = True
                for framed_msg in self.unpacker:
                    if first:
                        ret = framed_msg['body']
                        first = False
                    else:
                        self.saved_data.append(framed_msg['body'])
                if not first:
                    # We read at least one piece of data
                    break
        except tornado.ioloop.TimeoutError:
            # In the timeout case, just return None.
            # Keep 'self._read_stream_future' alive.
            ret = None
        except tornado.iostream.StreamClosedError as exc:
            log.trace('Subscriber disconnected from IPC {0}'.format(self.socket_path))
            self._read_stream_future = None
            exc_to_raise = exc
        except Exception as exc:
            log.error('Exception occurred in Subscriber while handling stream: {0}'.format(exc))
            self._read_stream_future = None
            exc_to_raise = exc
        finally:
            # Always release, including when an error is about to be
            # re-raised below; otherwise the next read would block forever
            # waiting on the semaphore.
            self._sync_read_in_progress.release()

        if self._sync_ioloop_running:
            # Stop the IO Loop so that self.io_loop.start() will return in
            # read_sync().
            self.io_loop.spawn_callback(self.io_loop.stop)

        if exc_to_raise is not None:
            raise exc_to_raise  # pylint: disable=E0702
        raise tornado.gen.Return(ret)

    def read_sync(self, timeout=None):
        '''
        Read a message from an IPC socket

        The socket must already be connected.
        The associated IO Loop must NOT be running.

        :param int timeout: Timeout when receiving message

        :return: message data if successful. None if timed out. Will raise an
                 exception for all other error conditions.
        '''
        if self.saved_data:
            return self.saved_data.pop(0)

        self._sync_ioloop_running = True
        self._read_sync_future = self._read_sync(timeout)
        # Blocks until _read_sync() schedules io_loop.stop.
        self.io_loop.start()
        self._sync_ioloop_running = False

        ret_future = self._read_sync_future
        self._read_sync_future = None
        return ret_future.result()

    @tornado.gen.coroutine
    def _read_async(self, callback):
        '''
        Read from the stream until it closes, scheduling ``callback`` with
        the body of every message unpacked.

        :param callback: A callable taking the message body
        '''
        while not self.stream.closed():
            try:
                self._read_stream_future = self.stream.read_bytes(4096, partial=True)
                wire_bytes = yield self._read_stream_future
                self._read_stream_future = None
                self.unpacker.feed(wire_bytes)
                for framed_msg in self.unpacker:
                    body = framed_msg['body']
                    self.io_loop.spawn_callback(callback, body)
            except tornado.iostream.StreamClosedError:
                log.trace('Subscriber disconnected from IPC {0}'.format(self.socket_path))
                break
            except Exception as exc:
                log.error('Exception occurred while Subscriber handling stream: {0}'.format(exc))

    @tornado.gen.coroutine
    def read_async(self, callback):
        '''
        Asynchronously read messages and invoke a callback when they are
        ready.

        Connection attempts are retried, one second apart, until the
        client is connected.

        :param callback: A callback with the received data
        '''
        while not self.connected():
            try:
                yield self.connect(timeout=5)
            except tornado.iostream.StreamClosedError:
                log.trace('Subscriber closed stream on IPC {0} before connect'.format(self.socket_path))
                yield tornado.gen.sleep(1)
            except Exception as exc:
                log.error('Exception occurred while Subscriber connecting: {0}'.format(exc))
                yield tornado.gen.sleep(1)
        yield self._read_async(callback)

    def close(self):
        '''
        Routines to handle any cleanup before the instance shuts down.
        Sockets and filehandles should be closed explicitly, to prevent
        leaks.
        '''
        if not self._closing:
            IPCClient.close(self)
            # This will prevent this message from showing up:
            # '[ERROR   ] Future exception was never retrieved:
            # StreamClosedError'
            if self._read_sync_future is not None:
                self._read_sync_future.exc_info()
            if self._read_stream_future is not None:
                self._read_stream_future.exc_info()

    def __del__(self):
        # globals() is keyed by name, so test for the class *name*; testing
        # for the class object itself is always false and close() would
        # never run from here.  The guard skips cleanup once the module
        # namespace no longer holds the class.
        if 'IPCMessageSubscriber' in globals():
            self.close()
class TornadoTransmission():
    '''Event transmission that batches and sends events from a tornado IOLoop.

    Events handed to `send` are placed on the `pending` queue. The `_sender`
    coroutine (started by `start`) drains that queue, groups events into
    batches and posts them with tornado's `AsyncHTTPClient`. One response dict
    per event is placed on the `responses` queue.
    '''

    def __init__(self, max_concurrent_batches=10, block_on_send=False,
                 block_on_response=False, max_batch_size=100,
                 send_frequency=timedelta(seconds=0.25),
                 user_agent_addition=''):
        '''
        max_concurrent_batches: size of the semaphore guarding `_send_batch`
        block_on_send: use `put` instead of `put_nowait` on the pending queue
        block_on_response: use `put` instead of `put_nowait` on the responses
            queue
        max_batch_size: number of buffered events that triggers a flush
        send_frequency: `timedelta` after which buffered events are flushed
        user_agent_addition: text appended to the default user agent string

        Raises ImportError when tornado is not available.
        '''
        if not has_tornado:
            raise ImportError(
                'TornadoTransmission requires tornado, but it was not found.'
            )

        self.block_on_send = block_on_send
        self.block_on_response = block_on_response
        self.max_batch_size = max_batch_size
        self.send_frequency = send_frequency

        user_agent = "libhoney-py/" + VERSION
        if user_agent_addition:
            user_agent += " " + user_agent_addition

        self.http_client = AsyncHTTPClient(
            force_instance=True,
            defaults=dict(user_agent=user_agent))

        # libhoney adds events to the pending queue for us to send
        self.pending = Queue(maxsize=1000)
        # we hand back responses from the API on the responses queue
        self.responses = Queue(maxsize=2000)

        # maps each in-flight HTTPRequest to its start time and events
        self.batch_data = {}
        self.sd = statsd.StatsClient(prefix="libhoney")
        self.batch_sem = Semaphore(max_concurrent_batches)

    def start(self):
        '''Schedule the `_sender` control loop on the current IOLoop.'''
        ioloop.IOLoop.current().spawn_callback(self._sender)

    def send(self, ev):
        '''send accepts an event and queues it to be sent'''
        self.sd.gauge("queue_length", self.pending.qsize())
        try:
            if self.block_on_send:
                self.pending.put(ev)
            else:
                self.pending.put_nowait(ev)
            self.sd.incr("messages_queued")
        except QueueFull:
            response = {
                "status_code": 0,
                "duration": 0,
                "metadata": ev.metadata,
                "body": "",
                "error": "event dropped; queue overflow",
            }
            if self.block_on_response:
                self.responses.put(response)
            else:
                try:
                    self.responses.put_nowait(response)
                except QueueFull:
                    # if the response queue is full when trying to add an event
                    # queue is full response, just skip it.
                    pass
            self.sd.incr("queue_overflow")

    # We're using the older decorator/yield model for compatibility with
    # Python versions before 3.5.
    # See: http://www.tornadoweb.org/en/stable/guide/coroutines.html#python-3-5-async-and-await
    @gen.coroutine
    def _sender(self):
        '''_sender is the control loop that pulls events off the `self.pending`
        queue and submits batches for actual sending. '''
        events = []
        last_flush = time.time()
        while True:
            try:
                ev = yield self.pending.get(timeout=self.send_frequency)
                if ev is None:
                    # signals shutdown
                    yield self._flush(events)
                    return
                events.append(ev)
                # `>=` so a batch never exceeds max_batch_size events
                batch_full = len(events) >= self.max_batch_size
                interval_elapsed = (time.time() - last_flush >
                                    self.send_frequency.total_seconds())
                if batch_full or interval_elapsed:
                    yield self._flush(events)
                    events = []
                    # Restart the interval after *every* flush. If it is only
                    # reset on the timeout path, steady traffic leaves the
                    # elapsed-time test permanently true and each event is
                    # sent as its own batch.
                    last_flush = time.time()
            except TimeoutError:
                yield self._flush(events)
                events = []
                last_flush = time.time()

    @gen.coroutine
    def _flush(self, events):
        '''Send `events` as one batch per destination; no-op when empty.'''
        if not events:
            return
        for dest, group in group_events_by_destination(events).items():
            yield self._send_batch(dest, group)

    @gen.coroutine
    def _send_batch(self, destination, events):
        ''' Makes a single batch API request with the given list of events. The
        `destination` argument contains the write key, API host and dataset
        name used to build the request.'''
        start = time.time()
        status_code = 0

        # enforce max_concurrent_batches. Acquired before the try block so the
        # finally clause only ever releases a slot that was really taken.
        yield self.batch_sem.acquire()
        try:
            url = urljoin(urljoin(destination.api_host, "/1/batch/"),
                          destination.dataset)
            payload = []
            for ev in events:
                event_time = ev.created_at.isoformat()
                if ev.created_at.tzinfo is None:
                    # naive timestamps get a "Z" suffix
                    event_time += "Z"
                payload.append({
                    "time": event_time,
                    "samplerate": ev.sample_rate,
                    "data": ev.fields()
                })
            req = HTTPRequest(
                url,
                method='POST',
                headers={
                    "X-Honeycomb-Team": destination.writekey,
                    "Content-Type": "application/json",
                },
                body=json.dumps(payload, default=json_default_handler),
            )
            self.http_client.fetch(req, self._response_callback)
            # store the events that were sent so we can process responses later
            # it is important that we delete these eventually, or we'll run into memory issues
            self.batch_data[req] = {"start": start, "events": events}
        except Exception as e:
            # Catch all exceptions and hand them to the responses queue.
            self._enqueue_errors(status_code, e, start, events)
        finally:
            # NOTE(review): the slot is given back once the request has been
            # handed to the HTTP client, not when its response arrives, so the
            # semaphore does not bound in-flight requests - confirm intended.
            self.batch_sem.release()

    def _enqueue_errors(self, status_code, error, start, events):
        '''Queue one error response per event and count each as a send error.'''
        for ev in events:
            self.sd.incr("send_errors")
            self._enqueue_response(status_code, "", error, start, ev.metadata)

    def _response_callback(self, resp):
        '''Turn a batch HTTP response into one queued response per event.'''
        # resp.request should be the same HTTPRequest object built by _send_batch
        # and mapped to values in batch_data
        batch = self.batch_data[resp.request]
        events = batch["events"]
        start = batch["start"]
        status_code = resp.code
        try:
            # raises if the request itself failed
            resp.rethrow()

            statuses = [d["status"] for d in json.loads(resp.body)]
            for ev, status in zip(events, statuses):
                self._enqueue_response(status, "", None, start, ev.metadata)
                self.sd.incr("messages_sent")
        except Exception as e:
            self._enqueue_errors(status_code, e, start, events)
            self.sd.incr("send_errors")
        finally:
            # clean up the data for this batch
            del self.batch_data[resp.request]

    def _enqueue_response(self, status_code, body, error, start, metadata):
        '''Build a response dict and put it on the responses queue.

        `duration` is the time since `start`, in milliseconds. When
        `block_on_response` is false a full queue drops the response.
        '''
        resp = {
            "status_code": status_code,
            "body": body,
            "error": error,
            "duration": (time.time() - start) * 1000,
            "metadata": metadata
        }
        if self.block_on_response:
            self.responses.put(resp)
        else:
            try:
                self.responses.put_nowait(resp)
            except QueueFull:
                pass

    def close(self):
        '''call close to send all in-flight requests and shut down the
        senders nicely. Times out after max 20 seconds per sending thread
        plus 10 seconds for the response queue'''
        # A bare number is read by tornado as an absolute deadline on the
        # IOLoop clock, not as "10 seconds from now"; a timedelta is relative.
        shutdown_timeout = timedelta(seconds=10)
        try:
            self.pending.put(None, shutdown_timeout)
        except QueueFull:
            pass
        # signal to the responses queue that nothing more is coming.
        try:
            self.responses.put(None, shutdown_timeout)
        except QueueFull:
            pass

    def get_response_queue(self):
        ''' return the responses queue on to which will be sent the response
        objects from each event send'''
        return self.responses
class Crawler(object):
    '''Image crawler driven by the module-level ``links`` and ``imageurls``
    queues.

    ``run`` repeatedly spawns ``handle_links`` / ``handle_imageurls`` workers.
    Link workers fetch a page and enqueue the links and image URLs found on
    it; image workers download an image and save it under ``img_dir`` when it
    is at least ``min_width`` x ``min_height``. At most ``workers_limit``
    fetches run at the same time.
    '''

    def _init_defaults(self):
        '''Set every tunable to its built-in default value.'''
        self.start_link = None
        # relative weights used by run() when both queues hold work
        self.link_priority = 2
        self.img_priority = 8
        # run() sleeps 0.1 * politeness seconds between scheduling rounds
        self.politeness = 2
        self.workers_limit = 10  # allow at most 10 concurrent workers
        self.link_regex = re.compile("^http://.*")
        self.img_regex = re.compile(".*")
        self.fname_digits = 4
        self.min_width = 200
        self.min_height = 200
        self.img_dir = "E:/tmp/"
        # consecutive idle rounds after which run() stops
        self.idle_wait_loops = 100
        self.port = 8888

    def _load_config(self):
        '''Override defaults with values from the ``[global]`` section of
        ``config.ini``, when present.

        Raises SystemExit for a non-positive ``politeness`` or an ``imgdir``
        that is not an existing directory.
        '''
        parser = ConfigParser.ConfigParser()
        parser.read("config.ini")
        if parser.has_option("global", "starturl"):
            self.start_link = parser.get("global", "starturl")
        if parser.has_option("global", "linkregex"):
            self.link_regex = re.compile(parser.get("global", "linkregex"))
        if parser.has_option("global", "imgregex"):
            self.img_regex = re.compile(parser.get("global", "imgregex"))
        if parser.has_option("global", "politeness"):
            politeness = parser.getint("global", "politeness")
            if politeness <= 0:
                print("politeness must be a positive integer")
                raise SystemExit()
            self.politeness = politeness
        if parser.has_option("global", "imgdir"):
            imgdir = parser.get("global", "imgdir")
            if not os.path.exists(imgdir) or not os.path.isdir(imgdir):
                print("invalid imgdir configuration")
                raise SystemExit()
            # file names are appended directly, so keep a trailing slash
            if not imgdir.endswith("/"):
                imgdir += "/"
            self.img_dir = imgdir
        if parser.has_option("global", "minwidth"):
            self.min_width = parser.getint("global", "minwidth")
        if parser.has_option("global", "minheight"):
            self.min_height = parser.getint("global", "minheight")

    def __init__(self, start_link=None):
        '''Build a crawler; an explicit ``start_link`` wins over config.ini.

        Raises SystemExit when no start link is available from either source.
        '''
        self._init_defaults()
        # Now load the config file to override defaults
        self._load_config()
        if start_link:
            self.start_link = start_link
        if not self.start_link:
            raise SystemExit("No start link is provided, exiting now...")
        links.put(self.start_link)
        self.semaphore = Semaphore(self.workers_limit)
        # number of handlers that hold a queue item and have not finished
        self._active_tasks = 0

    @gen.coroutine
    def run(self):
        '''Start the debug web server, then schedule workers until both
        queues stay empty, with no worker running, for ``idle_wait_loops``
        consecutive rounds.
        '''
        # First start an debug server
        app = Application([(r"/", WebHandler)])
        server = HTTPServer(app)
        server.listen(self.port)

        idle_loops = 0
        while True:
            if imageurls.qsize() == 0 and links.qsize() == 0:
                print("Both link and image queues are empty now")
                if self._active_tasks:
                    # a running handler may still enqueue more work
                    idle_loops = 0
                else:
                    idle_loops += 1
                    if idle_loops >= self.idle_wait_loops:
                        break
            else:
                idle_loops = 0  # clear the idle loop counter
                if imageurls.qsize() == 0:
                    self.handle_links()
                elif links.qsize() == 0:
                    self.handle_imageurls()
                else:
                    # weighted coin flip between the two kinds of work
                    choices = [0] * self.link_priority + [1] * self.img_priority
                    if random.choice(choices):
                        self.handle_imageurls()
                    else:
                        self.handle_links()
            yield gen.sleep(0.1 * self.politeness)

        # Wait for all link handlers. join() returns a Future, so it has to
        # be yielded for the wait to actually happen.
        yield links.join()
        # Handling imageurls if generated by the last few links: start one
        # handler per queued URL, then wait for them instead of spinning on
        # qsize() without giving the IOLoop a chance to run.
        for _ in range(imageurls.qsize()):
            self.handle_imageurls()
        yield imageurls.join()

    @gen.coroutine
    def _fetch_with_retries(self, url, tries=3):
        '''Fetch ``url`` up to ``tries`` times.

        Returns the first response whose code is 200, or None when every
        attempt failed.
        '''
        client = AsyncHTTPClient()
        for _ in range(tries):
            try:
                response = yield client.fetch(url)
            except Exception:
                # fetch() raises for non-2xx replies as well as for network
                # errors; both count as a failed attempt.
                continue
            if response.code == 200:
                raise gen.Return(response)
        raise gen.Return(None)

    @gen.coroutine
    def handle_links(self):
        '''Take one link off ``links``, fetch it and enqueue the links and
        image URLs found on the page.

        Already visited links are skipped; links that cannot be fetched are
        appended to ``link_failures``.
        '''
        # Take the item first, so a handler waiting on an empty queue does not
        # sit on a semaphore slot.
        newlink = yield links.get()
        self._active_tasks += 1
        try:
            # Make sure we haven't visited this one
            if newlink in visited_links:
                return
            visited_links.add(newlink)

            # the semaphore bounds concurrent fetches; always give it back
            yield self.semaphore.acquire()
            try:
                response = yield self._fetch_with_retries(newlink)
            finally:
                self.semaphore.release()

            if response is None:
                link_failures.append(newlink)
                print("[FAILURE] - %s" % newlink)
                return

            # TODO: replace this with a report api
            print("[VISITED] - %s" % newlink)

            # parse the html with bs
            soup = bs4.BeautifulSoup(response.body)

            # extract valid links and put into links
            for tag in soup.find_all("a"):
                if "href" not in tag.attrs:
                    continue
                href = tag['href']
                if href.startswith("#"):
                    continue
                # urljoin resolves root-relative, path-relative and
                # protocol-relative references and leaves absolute URLs alone
                href = urlparse.urljoin(newlink, href)
                if not self.link_regex.match(href):
                    continue
                if href in visited_links:
                    continue
                links.put(href)
                print("NEWLINK: %s" % href)

            # extract imgs and put into imageurls
            for tag in soup.find_all("img"):
                if "src" not in tag.attrs:
                    continue
                src = urlparse.urljoin(newlink, tag['src'])
                if not self.img_regex.match(src):
                    continue
                if src in downloaded_images:
                    continue
                imageurls.put(src)
                print("NEW IMAGE: %s" % src)
        finally:
            # Every get() needs a matching task_done(), whatever path was
            # taken, otherwise links.join() can never complete.
            self._active_tasks -= 1
            links.task_done()

    @gen.coroutine
    def handle_imageurls(self):
        '''Take one URL off ``imageurls``, download it and save it to
        ``img_dir`` if it is large enough.

        Already downloaded URLs are skipped; URLs that cannot be fetched are
        appended to ``download_failures``.
        '''
        global img_counter

        # Take the item first, so a handler waiting on an empty queue does not
        # sit on a semaphore slot.
        imgurl = yield imageurls.get()
        self._active_tasks += 1
        try:
            if imgurl in downloaded_images:
                return
            # mark the image as downloaded
            downloaded_images.add(imgurl)

            # the semaphore bounds concurrent fetches; always give it back
            yield self.semaphore.acquire()
            try:
                response = yield self._fetch_with_retries(imgurl)
            finally:
                self.semaphore.release()

            if response is None:
                download_failures.append(imgurl)
                print("[FAILURE] - %s" % imgurl)
                return

            # TODO: replace this with a report api
            print("[DOWNLOADED] - %s" % imgurl)

            # Read the file content
            img = PIL.Image.open(response.buffer)
            w, h = img.size
            if w < self.min_width or h < self.min_height:
                return

            # find out the image extension, default to jpg
            if "." in imgurl:
                ext = imgurl.split(".")[-1].lower()
                if ext not in ["jpg", "png", "gif"]:
                    ext = "jpg"
            elif img.format:
                ext = img.format.lower()
            else:
                ext = "jpg"

            # increment the counter
            img_counter += 1
            fname = str(img_counter).zfill(self.fname_digits) + "." + ext
            fpath = self.img_dir + fname

            # save the image file; the with-block closes the handle
            with open(fpath, "wb") as f:
                f.write(response.body)
        finally:
            # Every get() needs a matching task_done(), whatever path was
            # taken, otherwise imageurls.join() can never complete.
            self._active_tasks -= 1
            imageurls.task_done()