Example #1
# Imports reconstructed for context; HTTP_STATUS and logger, as well as the
# awaitable PublisherClient wrapper, are defined elsewhere in the source
# project.
from datetime import datetime
from sqlite3 import DatabaseError
from typing import Dict

from persistqueue import SQLiteAckQueue
from sanic import response
from sanic.request import Request


async def submit(
    request: Request,
    client: PublisherClient,
    q: SQLiteAckQueue,
    topic: str,
    metadata_headers: Dict[str, str],
    **kwargs,
) -> response.HTTPResponse:
    """Deliver request to the pubsub topic.

    Deliver to the local queue to be retried on transient errors.
    """
    data = request.body
    attrs = {
        key: value
        for key, value in dict(
            submission_timestamp=datetime.utcnow().isoformat() + "Z",
            uri=request.path,
            protocol="HTTP/" + request.version,
            method=request.method,
            args=request.query_string,
            remote_addr=request.ip,
            host=request.host,
            **{
                attr: request.headers.get(header)
                for header, attr in metadata_headers.items()
            },
        ).items()
        if value is not None
    }
    # assert valid pubsub message
    for value in attrs.values():
        if len(value.encode("utf8")) > 1024:
            # attribute exceeds value size limit of 1024 bytes
            # https://cloud.google.com/pubsub/quotas#resource_limits
            return response.text(
                "header too large\n", HTTP_STATUS.REQUEST_HEADER_FIELDS_TOO_LARGE
            )
    try:
        await client.publish(topic, data, **attrs)
    except ValueError:
        return response.text("payload too large\n", HTTP_STATUS.PAYLOAD_TOO_LARGE)
    except Exception:
        # api call failure, write to queue
        logger.exception("pubsub unavailable")
        try:
            q.put((topic, data, attrs))
        except DatabaseError:
            logger.exception("queue full")
            # sqlite queue is probably out of space
            return response.text("", HTTP_STATUS.INSUFFICIENT_STORAGE)
    return response.text("")
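
Once Pub/Sub recovers, the messages stored in the SQLiteAckQueue still need to be re-published. Below is a minimal sketch of such a retry loop using persist-queue's get/ack/nack API; the flush name and loop structure are assumptions rather than code from the project above, and it assumes a persist-queue version where get(block=False) raises Empty on an exhausted queue.

from persistqueue.exceptions import Empty


async def flush(client: PublisherClient, q: SQLiteAckQueue) -> None:
    """Re-publish messages stored while Pub/Sub was unavailable (sketch)."""
    while True:
        try:
            # non-blocking get: raises Empty once the queue is drained
            item = q.get(block=False)
        except Empty:
            break
        topic, data, attrs = item
        try:
            await client.publish(topic, data, **attrs)
        except Exception:
            # publish failed again: return the item so a later flush retries it
            q.nack(item)
            raise
        # delivered: remove the item from persistent storage for good
        q.ack(item)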
Example #2
# Imports reconstructed for context; Task, settings, the STATUS_* constants,
# QUEUE_LOCATION, update_task_status, _add_default_options, and _logger are
# defined elsewhere in the source project.
import json
import time
import traceback
from datetime import datetime

from persistqueue import SQLiteAckQueue


def _pool_tasks(interval, times_to_run):
    """
    A loop that runs forever and checks for new tasks in the Cassandra DB.
    Every new task in the DB has a 'created' status, so this method fetches
    all the tasks that have this status and queues them.
    Args:
        interval: how many seconds to sleep between polls of Cassandra;
        times_to_run: caps the number of iterations so that test threads do
        not run forever. [ONLY FOR TESTING]
    """
    times_run = 0
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    # this while condition will only be checked on testing, otherwise this loop
    # should run forever.
    while not times_to_run or times_run < times_to_run:
        _logger.debug("Calling pool tasks")
        all_tasks = Task.objects.filter(status=STATUS_CREATED).all()
        _logger.debug("Found %d tasks to process", len(all_tasks))
        for task in all_tasks:
            try:
                crawler_name = task.kind
                if task.options:
                    # json.loads already returns a fresh dict; no copy needed
                    options = json.loads(task.options)
                else:
                    options = {}
                options["crawler"] = crawler_name
                options["task_id"] = task.task_id

                # get the fixed options on the settings that will be aggregated
                # with the options sent on the table.
                default_settings = settings.DAVINCI_CONF["crawler-params"].get("default", {})
                crawler_settings = settings.DAVINCI_CONF["crawler-params"].get(crawler_name, {})

                _add_default_options(options, crawler_settings)
                _add_default_options(options, default_settings)

                # fixed options, place here all the fixed options
                options["current_execution_date"] = datetime.utcnow()

                params = json.loads(task.params)

                tasks_queue.put([params, options])
                update_task_status(task, STATUS_QUEUED)
            except Exception:
                update_task_status(task, STATUS_FAULTY, source="crawl command",
                                   more_info=traceback.format_exc())
                _logger.exception("Error while adding params to queue")
        time.sleep(interval)
        if times_to_run:
            times_run += 1
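
For completeness, the consumer side of this queue can acknowledge a task only after the crawl finishes, so a crash between get() and completion leaves the task available for retry. This is a hypothetical sketch, not code from the project: run_crawl is a stand-in for the real crawl entry point, and it assumes a persist-queue version whose get() blocks by default.

def _consume_tasks(run_crawl):
    """Drain the shared queue, acking tasks only after a successful crawl."""
    tasks_queue = SQLiteAckQueue(QUEUE_LOCATION)
    while True:
        item = tasks_queue.get()  # blocks until a task is available
        params, options = item
        try:
            run_crawl(params, options)
        except Exception:
            # crawl failed: put the task back for another attempt
            tasks_queue.nack(item)
            _logger.exception("Crawl failed; task returned to the queue")
        else:
            tasks_queue.ack(item)  # permanently remove the finished task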