Пример #1
0
    def __init__(self, nlprp_request: JsonObjectType) -> None:
        """
        Parse an NLPRP request and stash its components on the instance.

        Args:
            nlprp_request: dictionary from the (entire) JSON NLPRP request

        Raises:
            :exc:`NlprpError` for malformed requests
        """
        self.nlprp_request = nlprp_request
        args = json_get_toplevel_args(nlprp_request)

        # Look up every requested processor up front, so that per-document
        # iteration is fast later.
        self.processors = [
            ServerProcessor.get_processor_nlprp(proc_dict)
            for proc_dict in json_get_array(args,
                                            NKeys.PROCESSORS,
                                            required=True)
        ]

        # Should this request be queued rather than processed immediately?
        self.queue = json_get_bool(args, NKeys.QUEUE, default=False)

        # Job ID chosen by the client (empty string if absent).
        self.client_job_id = json_get_str(args,
                                          NKeys.CLIENT_JOB_ID,
                                          default="")

        # Should the reply echo the source text?
        self.include_text = json_get_bool(args, NKeys.INCLUDE_TEXT)

        # Content: list of objects (each with text and metadata).
        self.content = json_get_array(args, NKeys.CONTENT, required=True)
Пример #2
0
    def show_queue(self) -> JsonObjectType:
        """
        Finds the queue entries associated with the client, optionally
        restricted to one client job id.

        Returns:
            response dict containing an ``NKeys.QUEUE`` array with one
            entry per queue ID (status plus submission/completion times).
        """
        args = json_get_toplevel_args(self.body, required=False)
        if args:
            client_job_id = json_get_str(args,
                                         NKeys.CLIENT_JOB_ID,
                                         default="",
                                         required=False)
        else:
            client_job_id = ""

        # Queue IDs that are of interest
        queue_id_wheres = [Document.username == self.username
                           ]  # type: List[ClauseElement]  # nopep8
        if client_job_id:
            queue_id_wheres.append(Document.client_job_id == client_job_id)
        # noinspection PyUnresolvedReferences
        queue_ids = fetch_all_first_values(
            dbsession,
            select([Document.queue_id]).select_from(Document.__table__).where(
                and_(*queue_id_wheres)).distinct().order_by(
                    Document.queue_id))  # type: List[str]

        queue_answer = []  # type: JsonArrayType
        for queue_id in queue_ids:
            # DocProcRequest objects that are of interest
            dprs = list(
                dbsession.query(DocProcRequest).join(Document).filter(
                    Document.queue_id ==
                    queue_id).all())  # type: List[DocProcRequest]
            # BUG FIX: guard against empty dprs *before* using it.
            # Previously this assert came after the max() call below; with
            # an empty list, all([]) is True (so busy was False) and then
            # max([]) raised ValueError before the assert could fire.
            assert dprs, "No DocProcRequests found; bug?"
            busy = not all(dpr.done for dpr in dprs)
            if busy:
                # Placeholder only; never sent to the client, because
                # DATETIME_COMPLETED is None when busy (see below).
                max_time = datetime.datetime.min
            else:
                max_time = max(dpr.when_done_utc for dpr in dprs)
            dt_submitted = dprs[0].document.datetime_submitted_pendulum

            queue_answer.append({
                NKeys.QUEUE_ID:
                queue_id,
                NKeys.CLIENT_JOB_ID:
                client_job_id,
                NKeys.STATUS:
                NlprpValues.BUSY if busy else NlprpValues.READY,
                NKeys.DATETIME_SUBMITTED:
                pendulum_to_nlprp_datetime(dt_submitted, to_utc=True),
                NKeys.DATETIME_COMPLETED:
                (None if busy else pendulum_to_nlprp_datetime(max_time,
                                                              to_utc=True))
            })
        return self.create_response(status=HttpStatus.OK,
                                    extra_info={NKeys.QUEUE: queue_answer})
Пример #3
0
    def delete_from_queue(self) -> JsonObjectType:
        """
        Deletes from the queue all entries specified by the client.
        """
        args = json_get_toplevel_args(self.body)
        delete_all = json_get_bool(args, NKeys.DELETE_ALL, default=False)
        client_job_ids = json_get_array_of_str(args, NKeys.CLIENT_JOB_IDS)

        # Which DocProcRequests are to be cancelled?
        dpr_query = (dbsession.query(DocProcRequest).join(Document).filter(
            Document.username == self.username))
        if not delete_all:
            dpr_query = dpr_query.filter(
                Document.client_job_id.in_(client_job_ids))

        # Cancel any ongoing Celery jobs. Revoking via a single ResultSet
        # is quicker than forgetting each AsyncResult separately.
        async_results = [
            AsyncResult(id=dpr.docprocrequest_id, app=celery_app)
            for dpr in dpr_query.all()
        ]  # type: List[AsyncResult]
        res_set = ResultSet(results=async_results, app=celery_app)
        log.debug("About to revoke jobs...")
        res_set.revoke()  # will hang if backend not operational
        log.debug("... jobs revoked.")

        # Which Documents are to be deleted?
        doc_query = (dbsession.query(Document).filter(
            Document.username == self.username))
        if not delete_all:
            doc_query = doc_query.filter(
                Document.client_job_id.in_(client_job_ids))

        with sqla_transaction_commit():
            # Deleting the Document objects cascade-deletes the
            # DocProcRequest objects too.
            doc_query.delete(synchronize_session=False)

        # Return response
        return self.create_response(status=HttpStatus.OK)
Пример #4
0
    def fetch_from_queue(self) -> JsonObjectType:
        """
        Fetches requests for all document-processor pairs for the queue_id
        supplied by the user (if complete).

        If any document-processor pair for this queue entry is still
        pending, a "processing" response is returned. Otherwise, all
        results are collected, the queue entry is deleted from the
        database, and the results are returned.

        Raises:
            :exc:`NlprpError` (via :func:`mkerror`) if the queue_id is
            missing from the request or not found for this user.
        """
        # ---------------------------------------------------------------------
        # Args
        # ---------------------------------------------------------------------
        args = json_get_toplevel_args(self.body)
        queue_id = json_get_str(args, NKeys.QUEUE_ID, required=True)

        # ---------------------------------------------------------------------
        # Start with the DocProcRequests, because if some are still busy,
        # we will return a "busy" response.
        # ---------------------------------------------------------------------
        q_dpr = (dbsession.query(DocProcRequest).join(Document).filter(
            Document.username == self.username).filter(
                Document.queue_id == queue_id))
        q_doc = (dbsession.query(Document).filter(
            Document.username == self.username).filter(
                Document.queue_id == queue_id))
        dprs = list(q_dpr.all())  # type: List[DocProcRequest]
        if not dprs:
            raise mkerror(NOT_FOUND, "The queue_id given was not found")
        # Busy if any document-processor pair is not yet done.
        busy = not all([dpr.done for dpr in dprs])
        if busy:
            response = self.create_response(HttpStatus.PROCESSING, {})
            # todo: is it correct (from previous comments) that we can't
            # return JSON via Pyramid with a status of HttpStatus.PROCESSING?
            # If that's true, we have to force as below, but then we need to
            # alter the NLPRP docs (as these state the JSON code and HTTP code
            # should always be the same).
            self.set_http_response_status(HttpStatus.OK)
            return response

        # ---------------------------------------------------------------------
        # Make it easy to look up processors
        # ---------------------------------------------------------------------

        # Maps processor ID to its ServerProcessor, populated lazily below.
        processor_cache = {}  # type: Dict[str, ServerProcessor]

        def get_processor_cached(_processor_id: str) -> ServerProcessor:
            """
            Cache lookups for speed. (All documents will share the same set
            of processors, so there'll be a fair bit of duplication.)
            """
            nonlocal processor_cache
            try:
                return processor_cache[_processor_id]
            except KeyError:
                _processor = ServerProcessor.get_processor_from_id(
                    _processor_id)  # may raise  # noqa
                processor_cache[_processor_id] = _processor
                return _processor

        # ---------------------------------------------------------------------
        # Collect results by document
        # ---------------------------------------------------------------------

        doc_results = []  # type: JsonArrayType
        client_job_id = None  # type: Optional[str]
        # Deduplicate: several DocProcRequests may refer to one Document.
        docs = set(dpr.document for dpr in dprs)
        for doc in docs:
            # All documents in one queue entry share a client job ID, so the
            # first one seen is used for the whole response.
            if client_job_id is None:
                client_job_id = doc.client_job_id
            processor_data = []  # type: JsonArrayType
            # ... data for *all* the processors for this doc
            for dpr in doc.docprocrequests:
                procresult = json.loads(dpr.results)  # type: Dict[str, Any]
                # Fill in processor identity if the stored result lacks it.
                if procresult[NKeys.NAME] is None:
                    processor = get_processor_cached(dpr.processor_id)
                    procresult[NKeys.NAME] = processor.name
                    procresult[NKeys.TITLE] = processor.title
                    procresult[NKeys.VERSION] = processor.version
                processor_data.append(procresult)
            metadata = json.loads(doc.client_metadata)
            doc_result = {
                NKeys.METADATA: metadata,
                NKeys.PROCESSORS: processor_data
            }
            # Echo the source text only if the client originally asked for it.
            if doc.include_text:
                doc_result[NKeys.TEXT] = doc.doctext
            doc_results.append(doc_result)

        # ---------------------------------------------------------------------
        # Delete leftovers
        # ---------------------------------------------------------------------

        with sqla_transaction_commit():
            q_doc.delete(synchronize_session=False)
            # ... will also delete the DocProcRequests via a cascade

        response_info = {
            NKeys.CLIENT_JOB_ID:
            (client_job_id if client_job_id is not None else ""),
            NKeys.RESULTS:
            doc_results
        }
        return self.create_response(status=HttpStatus.OK,
                                    extra_info=response_info)