async def submit(
    request: Request,
    client: PublisherClient,
    q: SQLiteAckQueue,
    topic: str,
    metadata_headers: Dict[str, str],
    **kwargs,
) -> response.HTTPResponse:
    """Deliver request to the pubsub topic.

    Deliver to the local queue to be retried on transient errors.
    """
    data = request.body
    attrs = {
        key: value
        for key, value in dict(
            submission_timestamp=datetime.utcnow().isoformat() + "Z",
            uri=request.path,
            protocol="HTTP/" + request.version,
            method=request.method,
            args=request.query_string,
            remote_addr=request.ip,
            host=request.host,
            **{
                attr: request.headers.get(header)
                for header, attr in metadata_headers.items()
            },
        ).items()
        if value is not None
    }
    # assert valid pubsub message
    for value in attrs.values():
        if len(value.encode("utf8")) > 1024:
            # attribute exceeds value size limit of 1024 bytes
            # https://cloud.google.com/pubsub/quotas#resource_limits
            return response.text(
                "header too large\n", HTTP_STATUS.REQUEST_HEADER_FIELDS_TOO_LARGE
            )
    try:
        await client.publish(topic, data, **attrs)
    except ValueError:
        return response.text("payload too large\n", HTTP_STATUS.PAYLOAD_TOO_LARGE)
    except Exception:
        # api call failure, write to queue
        logger.exception("pubsub unavailable")
        try:
            q.put((topic, data, attrs))
        except DatabaseError:
            # sqlite queue is probably out of space
            logger.exception("queue full")
            return response.text("", HTTP_STATUS.INSUFFICIENT_STORAGE)
    return response.text("")
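# Illustrative only (not part of the module): an example of the metadata_headers
# argument consumed by submit() above. The header and attribute names here are
# assumptions chosen for the example.
example_metadata_headers = {"User-Agent": "user_agent", "X-Debug-Id": "x_debug_id"}
# With this mapping, a request carrying "User-Agent: curl/7.64.0" is published
# with the attribute {"user_agent": "curl/7.64.0"} alongside the standard fields
# (submission_timestamp, uri, method, remote_addr, ...); headers missing from the
# request are dropped by the `if value is not None` filter.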
def _pool_tasks(interval, times_to_run):
    """
    Loop forever and check for new tasks on the cassandra DB; every new Task
    on the DB has a 'created' status, so this method looks for all the tasks
    that have this status.

    Args:
        interval: the interval, in seconds, at which we should poll cassandra.
        times_to_run: used on tests to guarantee that the loop will not run
            forever. [ONLY FOR TESTING]
    """
    times_run = 0
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    # this while condition will only be checked on testing, otherwise this loop
    # should run forever.
    while not times_to_run or times_run < times_to_run:
        _logger.debug("Calling pool tasks")
        all_tasks = Task.objects.filter(status=STATUS_CREATED).all()
        _logger.debug("Found %d tasks to process", len(all_tasks))
        for task in all_tasks:
            try:
                crawler_name = task.kind
                if task.options:
                    options = json.loads(task.options)
                    options = options.copy()
                else:
                    options = {}
                options["crawler"] = crawler_name
                options["task_id"] = task.task_id
                # get the fixed options on the settings that will be aggregated
                # with the options sent on the table.
                default_settings = settings.DAVINCI_CONF["crawler-params"].get(
                    "default", {})
                crawler_settings = settings.DAVINCI_CONF["crawler-params"].get(
                    crawler_name, {})
                _add_default_options(options, crawler_settings)
                _add_default_options(options, default_settings)
                # fixed options, place here all the fixed options
                options["current_execution_date"] = datetime.utcnow()
                params = json.loads(task.params)
                tasks_queue.put([params, options])
                update_task_status(task, STATUS_QUEUED)
            except Exception:
                update_task_status(task, STATUS_FAULTY,
                                   source="crawl command",
                                   more_info=traceback.format_exc())
                _logger.exception("Error while adding params to queue")
        time.sleep(interval)
        if times_to_run:
            times_run += 1
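# Illustrative sketch (assumption): _add_default_options is not defined in this
# fragment. The helper below shows one plausible behaviour consistent with how
# _pool_tasks uses it, i.e. values from settings only fill in keys that the
# task's own options did not already provide.
def _add_default_options_sketch(options, default_options):
    for key, value in default_options.items():
        # keep the task-provided value when present, otherwise use the default
        options.setdefault(key, value)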
def test_init_app(app: Sanic, client: PublisherClient, mocker: MockFixture,
                  q: SQLiteAckQueue):
    # don't hit actual pubsub
    publish = mocker.patch.object(client.api, "publish")
    publish.return_value = PublishResponse(message_ids=["1"])

    # listener to create test conditions while sanic is running
    @app.listener("after_server_start")
    async def after_server_start(app, _):
        # stop Sanic
        app.stop()
        # queue message to be delivered on shutdown
        q.put(("topic", b"data", {}))
        q.put(("topic", b"data", {}))

    # set required configuration
    app.config.update(
        FLUSH_CONCURRENT_BYTES=1, FLUSH_CONCURRENT_MESSAGES=1, FLUSH_SLEEP_SECONDS=0
    )

    # configure sanic listeners to handle q in the background
    flush.init_app(app, client, q)

    # use a socket to bind to a random port and allow parallel testing
    sock = socket()
    sock.bind(("", 0))

    # start the app
    app.run(sock=sock)

    # make sure everything flushed cleanly
    assert q.size == 0
    assert q.unack_count() == 0

    # make sure publish was called the expected number of times
    assert publish.call_count == 2
def get_queue(config: dict) -> SQLiteAckQueue:
    """Create a SQLiteAckQueue.

    Use a SQLiteAckQueue because:

    * we use acks to ensure messages are only removed on success
    * persist-queue's SQLite*Queue is faster than its Queue
    * SQLite provides thread-safe and process-safe access
    """
    queue_config = {
        key[6:].lower(): value
        for key, value in config.items()
        if key.startswith("QUEUE_")
    }
    return SQLiteAckQueue(**queue_config)
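# Example (illustrative, key names are assumptions): config keys prefixed with
# QUEUE_ are lowercased, stripped of the prefix, and passed straight through to
# SQLiteAckQueue, so
#
#     get_queue({"QUEUE_PATH": "queue", "QUEUE_AUTO_COMMIT": True, "OTHER_KEY": 1})
#
# behaves like SQLiteAckQueue(path="queue", auto_commit=True); keys without the
# QUEUE_ prefix are ignored.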
def init_app(app: Sanic) -> Tuple[PublisherClient, SQLiteAckQueue]:
    """Initialize Sanic app with url rules."""
    # Initialize PubSub client
    timeout = app.config.get("PUBLISH_TIMEOUT_SECONDS", None)
    client = PublisherClient()
    client.api.publish = partial(
        client.api.publish,
        retry=Retry(TRANSIENT_ERRORS, deadline=timeout),
        timeout=timeout,
    )
    client._batch_class = AsyncioBatch
    # Use a SQLiteAckQueue because:
    # * we use acks to ensure messages are only removed on success
    # * persist-queue's SQLite*Queue is faster than its Queue
    # * SQLite provides thread-safe and process-safe access
    queue_config = {
        key[6:].lower(): value
        for key, value in app.config.items()
        if key.startswith("QUEUE_")
    }
    q = SQLiteAckQueue(**queue_config)
    # get metadata_headers config
    metadata_headers = app.config["METADATA_HEADERS"]
    # validate attribute keys
    for attribute in metadata_headers.values():
        if len(attribute.encode("utf8")) > 256:
            # https://cloud.google.com/pubsub/quotas#resource_limits
            raise ValueError("Metadata attribute exceeds key size limit of 256 bytes")
    # generate one view_func per topic
    handlers = {
        route.topic: partial(
            submit,
            client=client,
            q=q,
            topic=route.topic,
            metadata_headers=metadata_headers,
        )
        for route in app.config["ROUTE_TABLE"]
    }
    # add routes for ROUTE_TABLE
    for route in app.config["ROUTE_TABLE"]:
        app.add_route(
            handler=handlers[route.topic],
            uri=route.uri,
            methods=[method.upper() for method in route.methods],
            # required because handler.__name__ does not exist
            # must be a unique name for each handler
            name="submit_" + route.topic,
        )
    return client, q
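# Illustrative sketch (assumption): a minimal app.config shape that init_app
# above would accept. Route is a stand-in for whatever record type the real
# configuration uses; every value below is a placeholder, not the real config.
from collections import namedtuple

Route = namedtuple("Route", ["uri", "topic", "methods"])

example_config = dict(
    ROUTE_TABLE=[
        Route(uri="/submit/example", topic="example_topic", methods=["POST", "PUT"])
    ],
    METADATA_HEADERS={"User-Agent": "user_agent"},
    QUEUE_PATH=":memory:",
    PUBLISH_TIMEOUT_SECONDS=1,
)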
def _crawl_params(self):
    """
    Read the multiprocessing queue and call the crawl method to execute the
    crawling logic.

    Will run forever, so we need Ctrl+C to finish it.
    """
    times_run = 0
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    while True:
        if self.times_to_run and times_run > self.times_to_run:
            return
        task_id = None
        object_queue = None
        try:
            object_queue = tasks_queue.get(block=False)
            crawl_param, options = object_queue
            crawler_name = options.get("crawler")
            task_id = options.get("task_id")
            update_task_status(task_id, STATUS_IN_PROGRESS)
            _logger.debug("Reading a queue value %s", crawl_param)
            if "current_execution_date" not in options:
                options["current_execution_date"] = datetime.utcnow()
            self._crawl(crawler_name, task_id, crawl_param, options)
            update_task_status(task_id, STATUS_FINISHED)
            tasks_queue.ack(object_queue)
        except Empty:
            # the queue is empty, so wait one second and check again
            _logger.debug("No objects found on queue, waiting for 1 "
                          "second and try again")
            time.sleep(1)
        except Exception:
            if task_id and object_queue:
                update_task_status(task_id, STATUS_FAULTY,
                                   source="crawl consumer",
                                   more_info=traceback.format_exc())
                tasks_queue.ack_failed(object_queue)
            _logger.exception("Error while crawling params from queue")
        times_run += 1
def q() -> SQLiteAckQueue:
    return SQLiteAckQueue(":memory:")
import traceback
import logging
import time

from datetime import datetime

from davinci_crawling.management.commands.utils.utils import \
    update_task_status, get_crawler_by_name
from davinci_crawling.task.models import \
    STATUS_IN_PROGRESS, STATUS_FAULTY, STATUS_FINISHED
from persistqueue import SQLiteAckQueue
from persistqueue.exceptions import Empty
from threading import Thread

_logger = logging.getLogger("davinci_crawling.queue")

QUEUE_LOCATION = "tasks_queue"

# create the queue for the first time
SQLiteAckQueue(QUEUE_LOCATION)


class CrawlConsumer(object):
    """
    Initiates a crawl consumer that reads from the multiprocessing queue.

    It processes the parameters in parallel using a determined quantity of
    workers.
    """

    consumers = []

    def __init__(self, qty_workers=2, times_to_run=None):
        """
        Args: