Пример #1
0
def consume_data(topic, borker, group_id):
    """Subscribe to ``topic`` and "thing-event", printing every message value.

    Iterates the consumer for as long as it yields messages. Any exception
    raised while reading or printing a message is printed and the loop moves
    on to the next message.

    Args:
        topic: Topic name to subscribe to, alongside the fixed "thing-event".
        borker: Broker address; passed to ``Consumer`` as a one-element list.
            (The spelling is part of the existing signature.)
        group_id: Consumer group identifier forwarded to ``Consumer``.
    """
    consumer = Consumer(brokers=[borker], group_id=group_id)
    consumer.subscribe([topic, "thing-event"])

    for message in consumer:
        try:
            print(message.value)
        except Exception as exc:
            # Best effort: report the failure and keep consuming.
            print(exc)
Пример #2
0
class Crawler:
    """Queue-driven crawler.

    For every URL received from 'url_queue' the crawler scrapes the page,
    makes sure a document record and an index entry exist for it, refreshes
    both when the content digest changed, enqueues outgoing links and finally
    asks the pagerank service to update the document.
    """

    # Bound checked against ``url_counter`` before a link is followed. The
    # counter starts at 1, so at most MAX_URL - 1 links are enqueued per
    # Crawler instance.
    MAX_URL = 10

    def __init__(self):
        """Create the service clients and the producer/consumer on 'url_queue'."""
        self.url_counter = 1
        self.document_client = DocumentClient()
        self.indexing_client = IndexingClient()
        self.pagerank_client = PagerankClient()
        self.producer = Producer('url_queue')
        self.consumer = Consumer('url_queue')

    def run(self):
        """Register ``run_for_url`` as the consumer's message callback."""
        self.consumer.subscribe(self.run_for_url)

    def run_for_url(self, ch, method, properties, body):
        """Process one queued URL.

        Args:
            ch: Unused. NOTE(review): the signature looks like a pika
                consumer callback -- confirm against ``Consumer``.
            method: Unused.
            properties: Unused.
            body: UTF-8 encoded bytes holding the URL to crawl.
        """
        url = body.decode("utf-8")
        print("[Crawler] Received %r" % url)

        text = WebScraper.get_text(url)
        links = WebScraper.get_links(url)

        # The SHA-256 of the page text is the change-detection fingerprint.
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()

        # A lookup result without an "id" key means the document is unknown.
        record = self.document_client.get_by_url(url)
        if "id" not in record:
            record = self.document_client.create(url, digest)
        doc_id = record["id"]

        # Likewise, an index lookup without a "url" key means "not indexed yet".
        index_entry = self.indexing_client.get_by_id(doc_id)
        if "url" not in index_entry:
            self.indexing_client.index(doc_id, url, text)

        # Stored digest differs from the fresh one: refresh record and index.
        if record["digest"] != digest:
            self.document_client.update_digest(doc_id, digest)
            self.indexing_client.update_content(doc_id, text)

        for link in links:
            if self.url_counter >= Crawler.MAX_URL:
                continue
            self.url_counter += 1

            link_url = link.geturl()
            child = self.document_client.get_by_url(link_url)
            if "id" not in child:
                # Created with the literal placeholder digest "digest"; the
                # real digest is only computed once the child URL is crawled.
                child = self.document_client.create(link_url, "digest")
            self.document_client.create_link(doc_id, child["id"])
            self.producer.publish(link_url)

        self.pagerank_client.update(doc_id)
Пример #3
0
class MessageBus:
    """Publish/subscribe and request/response helper over an AMQP broker.

    Publishing opens a short-lived ``pika`` blocking connection per message.
    Subscriptions are delegated to a ``Consumer`` created with the same
    broker URL, queue prefix and exchange.
    """

    def __init__(self,
                 broker_url='amqp://localhost',
                 queue_prefix=None,
                 exchange='messagebus'):
        """Store the connection settings and create the long-lived consumer.

        Args:
            broker_url: AMQP URL handed to ``pika.URLParameters`` and to
                ``Consumer``.
            queue_prefix: Forwarded unchanged to every ``Consumer``.
            exchange: Exchange name used for publishing and consuming.
        """
        self.broker_url = broker_url
        self.consumer = Consumer(self.broker_url, queue_prefix, exchange)
        self._queue_prefix = queue_prefix
        self.exchange = exchange

    def publish(self, message, payload=None):
        """Publish ``payload`` with ``message`` as the routing key.

        Args:
            message: Routing key of the message.
            payload: Mapping to send; ``None`` (the default) is treated as an
                empty mapping. A ``timestamp`` entry is added when missing.
        """
        self._publish(message, payload)

    def _publish(self, message, payload, correlation_id=None):
        """Serialize ``payload`` to JSON and publish it on a fresh connection.

        Args:
            message: Routing key.
            payload: Mapping to send, or ``None`` for an empty payload.
            correlation_id: When truthy, attached to the message properties
                so a reply can be matched to its request.
        """
        body = json.dumps(self._prepare_payload(payload), ensure_ascii=False)
        connection = pika.BlockingConnection(
            pika.URLParameters(self.broker_url))
        try:
            channel = connection.channel()

            properties = None
            if correlation_id:
                properties = pika.BasicProperties(
                    correlation_id=correlation_id)

            channel.basic_publish(exchange=self.exchange,
                                  routing_key=message,
                                  body=body,
                                  properties=properties)
        finally:
            # Release the connection even when publishing fails. Skip the
            # close if the failure already tore the connection down, so the
            # original error is not masked by a second one.
            if connection.is_open:
                connection.close()

    def _prepare_payload(self, payload):
        """Return a JSON-serializable copy of ``payload`` with a timestamp.

        ``datetime.datetime`` values are replaced by their ISO-8601 string.
        If no ``timestamp`` key is present, the current naive UTC time is
        added under that key. The input mapping is never mutated.

        Args:
            payload: Mapping to prepare, or ``None`` for an empty payload.

        Returns:
            A new ``dict`` ready for ``json.dumps``.
        """
        def serialize(value):
            if isinstance(value, datetime.datetime):
                return value.isoformat()
            return value

        items = payload.items() if payload is not None else ()
        proc_payload = {k: serialize(v) for k, v in items}
        if 'timestamp' not in proc_payload:
            # Kept as naive UTC so the emitted string format stays the same.
            proc_payload['timestamp'] = datetime.datetime.utcnow().isoformat()
        return proc_payload

    def subscribe(self, message, callback):
        """Register ``callback`` for ``message`` on the long-lived consumer."""
        self.consumer.subscribe(message, callback)

    def subscribe_and_publish_response(self, message, callback):
        """Answer requests for ``message`` with the result of ``callback``.

        Each incoming request payload is passed to ``callback``; its return
        value is published as ``message + '.answered'`` carrying the
        request's correlation id.

        Args:
            message: Routing key of the requests to answer.
            callback: Called with the request payload; returns the response
                mapping (``None`` yields a timestamp-only response).
        """
        def subscribe_callback(request_payload, **kwargs):
            correlation_id = kwargs['properties'].correlation_id
            response = callback(request_payload)
            self._publish(message + '.answered', response, correlation_id)

        self.consumer.subscribe(message,
                                subscribe_callback,
                                transient_queue=True)

    def publish_and_get_response(self, message, payload, timeout_secs=5):
        """Publish a request and block until the matching response arrives.

        A dedicated consumer listens for ``message + '.answered'`` on a
        daemon thread. Responses whose correlation id differs from the one
        sent are ignored. The dedicated consumer is always stopped before
        this method returns or raises.

        Args:
            message: Routing key of the request.
            payload: Request mapping.
            timeout_secs: Seconds to wait for the response after publishing.

        Returns:
            The response payload.

        Raises:
            MessageBusTimeoutError: No matching response arrived in time.
        """
        sent_correlation = str(uuid.uuid1())
        consumer_ready = Event()
        response_received = Event()
        response = {}

        consumer = Consumer(self.broker_url, self._queue_prefix, self.exchange)
        consumer.on_connection_setup_finished = consumer_ready.set

        def response_callback(response_payload, **kwargs):
            if kwargs['properties'].correlation_id != sent_correlation:
                return
            response['payload'] = response_payload
            response_received.set()

        def wait_for_response():
            consumer.subscribe(message + '.answered',
                               response_callback,
                               transient_queue=True)
            consumer.start()

        thread = Thread(target=wait_for_response)
        thread.daemon = True
        thread.start()

        try:
            # Give the listener up to 2 seconds to finish its setup so the
            # response cannot arrive before anyone is listening. The request
            # is sent regardless of whether the wait succeeded.
            consumer_ready.wait(2)
            self._publish(message, payload, correlation_id=sent_correlation)
            if not response_received.wait(timeout_secs):
                raise MessageBusTimeoutError()
        finally:
            # Stop the temporary consumer on success, timeout and publish
            # failure alike; otherwise each failed call leaks a consumer.
            consumer.stop()
        return response.get('payload')

    def start(self):
        """Start the long-lived consumer."""
        self.consumer.start()

    def stop(self):
        """Stop the long-lived consumer."""
        self.consumer.stop()