def consume_data(topic, borker, group_id):
    """Print the value of every message received by a consumer.

    A ``Consumer`` is created for the single broker ``borker`` and the given
    ``group_id``, subscribed to ``topic`` plus the fixed ``"thing-event"``
    topic, and then iterated. Iteration only ends when the consumer itself
    stops yielding messages.

    Args:
        topic: Topic name to subscribe to (in addition to ``"thing-event"``).
        borker: Broker address; passed on as a one-element broker list.
            (The spelling is kept because callers may pass it by keyword.)
        group_id: Consumer group identifier handed to ``Consumer``.
    """
    subscriptions = [topic, "thing-event"]
    consumer = Consumer(brokers=[borker], group_id=group_id)
    consumer.subscribe(subscriptions)

    for message in consumer:
        # Best effort: a message that cannot be read or printed is reported
        # and skipped so that the loop keeps consuming.
        try:
            print(message.value)
        except Exception as err:
            print(err)
class Crawler:
    """Queue-driven crawler that stores, indexes and links fetched documents.

    URLs arrive as messages on ``'url_queue'``. Each one is scraped, recorded
    through the document client, indexed, and its outgoing links are pushed
    back onto the same queue until the per-instance URL budget is used up.
    """

    # Upper bound for ``url_counter``; no further links are enqueued once the
    # counter reaches this value.
    MAX_URL = 10

    def __init__(self):
        """Create the service clients and the queue producer/consumer."""
        # Starts at 1, so at most MAX_URL - 1 discovered links are enqueued.
        self.url_counter = 1
        self.document_client = DocumentClient()
        self.indexing_client = IndexingClient()
        self.pagerank_client = PagerankClient()
        self.producer = Producer('url_queue')
        self.consumer = Consumer('url_queue')

    def run(self):
        """Register ``run_for_url`` as the callback for incoming URLs."""
        self.consumer.subscribe(self.run_for_url)

    def run_for_url(self, ch, method, properties, body):
        """Process one queued URL.

        Args:
            ch: Unused here; part of the consumer callback signature.
            method: Unused here; part of the consumer callback signature.
            properties: Unused here; part of the consumer callback signature.
            body: UTF-8 encoded URL of the document to crawl.
        """
        url = body.decode("utf-8")
        print("[Crawler] Received %r" % (url,))

        text = WebScraper.get_text(url)
        links = WebScraper.get_links(url)
        # The SHA-256 of the page text is what detects content changes.
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()

        documents = self.document_client
        index = self.indexing_client

        # A lookup result without an "id" key means the URL is not stored yet.
        record = documents.get_by_url(url)
        if "id" not in record:
            record = documents.create(url, digest)

        # Likewise, an index entry without a "url" key means "not indexed".
        if "url" not in index.get_by_id(record["id"]):
            index.index(record["id"], url, text)

        # Stored digest differs from the fresh one: refresh record and index.
        if record["digest"] != digest:
            documents.update_digest(record["id"], digest)
            index.update_content(record["id"], text)

        for link in links:
            if self.url_counter >= Crawler.MAX_URL:
                # Budget exhausted: remaining links are ignored.
                continue
            self.url_counter += 1

            child = documents.get_by_url(link.geturl())
            if "id" not in child:
                # The literal "digest" is a placeholder; it will not match the
                # real hash computed when the child URL is crawled.
                child = documents.create(link.geturl(), "digest")

            documents.create_link(record["id"], child["id"])
            self.producer.publish(link.geturl())

        self.pagerank_client.update(record["id"])
class MessageBus:
    """Publish/subscribe facade over an AMQP exchange.

    Publishing opens a short-lived ``pika`` blocking connection per message.
    Subscriptions are delegated to a ``Consumer`` built from the same broker
    URL, queue prefix and exchange.
    """

    def __init__(self, broker_url='amqp://localhost', queue_prefix=None,
                 exchange='messagebus'):
        """Store the connection settings and create the shared consumer.

        Args:
            broker_url: AMQP URL used for every connection.
            queue_prefix: Passed through to ``Consumer`` unchanged.
            exchange: Name of the exchange messages are published to.
        """
        self.broker_url = broker_url
        self.consumer = Consumer(self.broker_url, queue_prefix, exchange)
        self._queue_prefix = queue_prefix
        self.exchange = exchange

    def publish(self, message, payload=None):
        """Publish ``payload`` with ``message`` as the routing key.

        Args:
            message: Routing key of the message.
            payload: Mapping to send; ``None`` (the default) sends a payload
                that contains only the generated timestamp.
        """
        # ``None`` replaces the former shared ``{}`` default argument.
        self._publish(message, {} if payload is None else payload)

    def _publish(self, message, payload, correlation_id=None):
        """Serialize ``payload`` to JSON and publish it on the exchange.

        Args:
            message: Routing key.
            payload: Mapping to send (see ``_prepare_payload``).
            correlation_id: Optional id attached as an AMQP message property;
                falsy values send the message without properties.
        """
        body = json.dumps(self._prepare_payload(payload), ensure_ascii=False)
        properties = None
        if correlation_id:
            properties = pika.BasicProperties(correlation_id=correlation_id)

        connection = pika.BlockingConnection(
            pika.URLParameters(self.broker_url))
        try:
            channel = connection.channel()
            channel.basic_publish(exchange=self.exchange,
                                  routing_key=message,
                                  body=body,
                                  properties=properties)
        finally:
            # Close even when opening the channel or publishing fails, so a
            # failed publish does not leak the connection.
            connection.close()

    def _prepare_payload(self, payload):
        """Return a JSON-ready copy of ``payload`` with a timestamp.

        ``datetime`` values are converted to ISO-8601 strings; other values
        are passed through. A ``'timestamp'`` key holding the current UTC
        time (naive ISO-8601 string) is added when the payload has none.
        The input mapping is not modified.

        Args:
            payload: Mapping to prepare, or ``None`` for an empty payload.

        Returns:
            A new dict.
        """
        if payload is None:
            # E.g. a responder callback that returned nothing.
            payload = {}

        def serialize(value):
            if isinstance(value, datetime.datetime):
                return value.isoformat()
            return value

        proc_payload = {k: serialize(v) for k, v in payload.items()}
        if 'timestamp' not in proc_payload:
            # Same naive-UTC string that ``utcnow().isoformat()`` produces,
            # without relying on the deprecated ``utcnow``.
            now = datetime.datetime.now(datetime.timezone.utc)
            proc_payload['timestamp'] = now.replace(tzinfo=None).isoformat()
        return proc_payload

    def subscribe(self, message, callback):
        """Register ``callback`` for ``message`` on the shared consumer."""
        self.consumer.subscribe(message, callback)

    def subscribe_and_publish_response(self, message, callback):
        """Answer each ``message`` request with ``callback``'s return value.

        The response is published under ``message + '.answered'`` and carries
        the correlation id of the request it answers.

        Args:
            message: Routing key of the request message.
            callback: Called with the request payload; its return value is
                the response payload.
        """
        def subscribe_callback(request_payload, **kwargs):
            correlation_id = kwargs['properties'].correlation_id
            response = callback(request_payload)
            self._publish(message + '.answered', response, correlation_id)

        self.consumer.subscribe(message, subscribe_callback,
                                transient_queue=True)

    def publish_and_get_response(self, message, payload, timeout_secs=5):
        """Publish a request and block until its response arrives.

        A temporary consumer listens on ``message + '.answered'`` in a daemon
        thread; only a response whose correlation id matches this request is
        accepted.

        Args:
            message: Routing key of the request.
            payload: Request payload mapping.
            timeout_secs: Seconds to wait for the response after publishing.

        Returns:
            The payload of the matching response.

        Raises:
            MessageBusTimeoutError: No matching response arrived in time.
        """
        sent_correlation = str(uuid.uuid1())
        consumer_ready = Event()
        response_received = Event()
        response = {}

        def on_consumer_ready():
            consumer_ready.set()

        def response_callback(response_payload, **kwargs):
            if kwargs['properties'].correlation_id != sent_correlation:
                # Response to somebody else's request.
                return
            response['payload'] = response_payload
            response_received.set()

        consumer = Consumer(self.broker_url, self._queue_prefix,
                            self.exchange)
        consumer.on_connection_setup_finished = on_consumer_ready

        def wait_for_response():
            consumer.subscribe(message + '.answered', response_callback,
                               transient_queue=True)
            consumer.start()

        thread = Thread(target=wait_for_response)
        thread.daemon = True
        thread.start()

        try:
            # Give the consumer up to two seconds to finish its setup before
            # the request goes out; the request is sent either way.
            consumer_ready.wait(2)
            self._publish(message, payload, correlation_id=sent_correlation)
            if not response_received.wait(timeout_secs):
                raise MessageBusTimeoutError()
        finally:
            # Stop the temporary consumer on every exit path, including the
            # timeout, so it does not keep running in its thread.
            consumer.stop()
        return response.get('payload')

    def start(self):
        """Start the shared consumer."""
        self.consumer.start()

    def stop(self):
        """Stop the shared consumer."""
        self.consumer.stop()