예제 #1
0
    def __init__(self, nlprp_request: JsonObjectType) -> None:
        """
        Unpack the top-level arguments of an NLPRP request into attributes.

        Args:
            nlprp_request: dictionary from the (entire) JSON NLPRP request

        Raises:
            :exc:`NlprpError` for malformed requests
        """
        self.nlprp_request = nlprp_request
        toplevel_args = json_get_toplevel_args(nlprp_request)

        # Resolve every requested processor once, here, so that per-document
        # work later only has to iterate over ready-made objects.
        processor_specs = json_get_array(
            toplevel_args, NKeys.PROCESSORS, required=True)
        self.processors = list(
            map(ServerProcessor.get_processor_nlprp, processor_specs))

        # Should the request be queued?
        self.queue = json_get_bool(toplevel_args, NKeys.QUEUE, default=False)

        # The client's own job identifier (blank if not supplied).
        self.client_job_id = json_get_str(
            toplevel_args, NKeys.CLIENT_JOB_ID, default="")

        # Should the reply echo the source text back?
        # (No explicit default is passed; the helper's own default applies.)
        self.include_text = json_get_bool(toplevel_args, NKeys.INCLUDE_TEXT)

        # The documents themselves: a list of objects, each carrying text
        # and metadata.
        self.content = json_get_array(
            toplevel_args, NKeys.CONTENT, required=True)
예제 #2
0
    def show_queue(self) -> JsonObjectType:
        """
        Finds the queue entries associated with the client, optionally
        restricted to one client job id.

        The optional client job ID is read from the request body. Each
        distinct queue ID belonging to this user (and matching the job ID, if
        one was given) is reported with its status and submission/completion
        times. An entry is "busy" until every one of its DocProcRequests is
        done; its completion time is then the latest ``when_done_utc``.

        Returns:
            the response from :meth:`create_response`, with status
            ``HttpStatus.OK`` and the list of queue entries under
            ``NKeys.QUEUE``

        Raises:
            AssertionError: if a queue ID has no DocProcRequests at all
        """
        args = json_get_toplevel_args(self.body, required=False)
        if args:
            client_job_id = json_get_str(args,
                                         NKeys.CLIENT_JOB_ID,
                                         default="",
                                         required=False)
        else:
            client_job_id = ""

        # Queue IDs that are of interest
        queue_id_wheres = [Document.username == self.username
                           ]  # type: List[ClauseElement]  # nopep8
        if client_job_id:
            queue_id_wheres.append(Document.client_job_id == client_job_id)
        # noinspection PyUnresolvedReferences
        queue_ids = fetch_all_first_values(
            dbsession,
            select([Document.queue_id]).select_from(Document.__table__).where(
                and_(*queue_id_wheres)).distinct().order_by(
                    Document.queue_id))  # type: List[str]

        queue_answer = []  # type: JsonArrayType
        for queue_id in queue_ids:
            # DocProcRequest objects that are of interest
            dprs = list(
                dbsession.query(DocProcRequest).join(Document).filter(
                    Document.queue_id ==
                    queue_id).all())  # type: List[DocProcRequest]

            # Sanity-check BEFORE using dprs: with an empty list, all() is
            # True and max() raises ValueError, which would hide this message.
            assert dprs, "No DocProcRequests found; bug?"

            busy = not all(dpr.done for dpr in dprs)
            dt_submitted = dprs[0].document.datetime_submitted_pendulum
            # The completion time only exists once every request is done.
            max_time = (None if busy
                        else max(dpr.when_done_utc for dpr in dprs))

            # NOTE(review): client_job_id below is the filter value taken
            # from this request, so it is "" whenever the client did not
            # filter -- confirm whether each entry's stored job ID should be
            # reported instead.
            queue_answer.append({
                NKeys.QUEUE_ID:
                queue_id,
                NKeys.CLIENT_JOB_ID:
                client_job_id,
                NKeys.STATUS:
                NlprpValues.BUSY if busy else NlprpValues.READY,
                NKeys.DATETIME_SUBMITTED:
                pendulum_to_nlprp_datetime(dt_submitted, to_utc=True),
                NKeys.DATETIME_COMPLETED:
                (None if busy else pendulum_to_nlprp_datetime(max_time,
                                                              to_utc=True))
            })
        return self.create_response(status=HttpStatus.OK,
                                    extra_info={NKeys.QUEUE: queue_answer})
예제 #3
0
 def handle_nlprp_request(self) -> JsonObjectType:
     """
     Entry point for an incoming NLPRP request.

     Runs the authentication step, loads the JSON body from the request,
     reads the (mandatory) command from it, logs what was received, and
     hands the command to :meth:`parse_command`, whose result is returned.
     """
     self._authenticate()
     self._set_body_json_from_request()
     command = json_get_str(self.body, NKeys.COMMAND, required=True)

     received_msg = (
         f"NLPRP request received from {self.request.remote_addr}: "
         f"username={self.username}, command={command}"
     )
     log.debug(received_msg)
     if DEBUG_SHOW_REQUESTS:
         # Optionally dump the whole request body as well.
         log.debug(f"Request: {self.body!r}")

     response = self.parse_command(command)
     return response
예제 #4
0
    def gen_text_metadataobj(
            self) -> Generator[Tuple[str, JsonValueType], None, None]:
        """
        Iterates over the documents in the request, producing each one's text
        alongside its metadata as a JSON value (Python object), unserialized.

        Yields:
            tuple: ``(text, metadata)``; ``metadata`` is ``None`` when the
            document supplies none
        """
        for doc in self.content:
            # The text is mandatory; the metadata is optional.
            doc_text = json_get_str(doc, NKeys.TEXT, required=True)
            doc_metadata = json_get_value(
                doc, NKeys.METADATA, default=None, required=False)
            yield doc_text, doc_metadata
예제 #5
0
    def gen_text_metadatastr(self) -> Generator[Tuple[str, str], None, None]:
        """
        Iterates over the documents in the request, producing each one's text
        alongside its metadata serialized to a compact JSON string.

        Yields:
            tuple: ``(text, metadata)``, where ``metadata`` is the JSON
            serialization of the document's metadata (of ``None`` if the
            document supplies none)

        Raises:
            the error built by ``key_missing_error(key=NKeys.TEXT)`` if a
            ``KeyError`` escapes while the documents are being processed
        """
        try:
            for doc in self.content:
                # The text is mandatory; the metadata is optional.
                doc_text = json_get_str(doc, NKeys.TEXT, required=True)
                doc_metadata = json_get_value(
                    doc, NKeys.METADATA, default=None, required=False)
                serialized = json.dumps(
                    doc_metadata, separators=JSON_SEPARATORS_COMPACT)
                yield doc_text, serialized
        except KeyError:
            # NOTE(review): any KeyError in the loop is reported as a missing
            # text key -- presumably only the text lookup can raise it;
            # confirm against the helpers.
            raise key_missing_error(key=NKeys.TEXT)
예제 #6
0
    def fetch_from_queue(self) -> JsonObjectType:
        """
        Returns the results for every document-processor pair in the queue
        entry named by the client, provided that all of them have finished.

        If any request is still outstanding, a "processing" response is
        returned (with the HTTP status forced to OK) and nothing is deleted.
        Otherwise the results are gathered per document, the documents are
        deleted from the database, and the results are returned.

        Raises:
            the error built by ``mkerror(NOT_FOUND, ...)`` if this user has
            no requests under the given queue ID
        """
        # ---------------------------------------------------------------------
        # Arguments
        # ---------------------------------------------------------------------
        args = json_get_toplevel_args(self.body)
        queue_id = json_get_str(args, NKeys.QUEUE_ID, required=True)

        # ---------------------------------------------------------------------
        # Look at the DocProcRequests first: if any is unfinished, the whole
        # answer is just "busy".
        # ---------------------------------------------------------------------
        request_query = (
            dbsession.query(DocProcRequest)
            .join(Document)
            .filter(Document.username == self.username)
            .filter(Document.queue_id == queue_id)
        )
        requests = list(request_query.all())  # type: List[DocProcRequest]
        if not requests:
            raise mkerror(NOT_FOUND, "The queue_id given was not found")

        if any(not req.done for req in requests):
            busy_response = self.create_response(HttpStatus.PROCESSING, {})
            # todo: is it correct (from previous comments) that we can't
            # return JSON via Pyramid with a status of HttpStatus.PROCESSING?
            # If that's true, we have to force as below, but then we need to
            # alter the NLPRP docs (as these state the JSON code and HTTP code
            # should always be the same).
            self.set_http_response_status(HttpStatus.OK)
            return busy_response

        # ---------------------------------------------------------------------
        # Processor lookup, memoized
        # ---------------------------------------------------------------------

        known_processors = {}  # type: Dict[str, ServerProcessor]

        def lookup_processor(_processor_id: str) -> ServerProcessor:
            """
            Fetch a processor by ID, remembering it for next time. (The same
            processors recur across documents, so this avoids repeat work.)
            """
            if _processor_id not in known_processors:
                known_processors[_processor_id] = (
                    ServerProcessor.get_processor_from_id(_processor_id)
                )  # may raise
            return known_processors[_processor_id]

        # ---------------------------------------------------------------------
        # Gather results, one entry per document
        # ---------------------------------------------------------------------

        doc_results = []  # type: JsonArrayType
        client_job_id = None  # type: Optional[str]
        for doc in {req.document for req in requests}:
            # Use the first non-None job ID encountered.
            if client_job_id is None:
                client_job_id = doc.client_job_id

            # Results from *all* the processors applied to this document.
            per_processor = []  # type: JsonArrayType
            for docproc in doc.docprocrequests:
                result = json.loads(docproc.results)  # type: Dict[str, Any]
                if result[NKeys.NAME] is None:
                    # Fill in the processor's details where the stored
                    # result has no name.
                    proc = lookup_processor(docproc.processor_id)
                    result[NKeys.NAME] = proc.name
                    result[NKeys.TITLE] = proc.title
                    result[NKeys.VERSION] = proc.version
                per_processor.append(result)

            doc_result = {
                NKeys.METADATA: json.loads(doc.client_metadata),
                NKeys.PROCESSORS: per_processor,
            }
            if doc.include_text:
                doc_result[NKeys.TEXT] = doc.doctext
            doc_results.append(doc_result)

        # ---------------------------------------------------------------------
        # Clean up: remove the documents now that their results are collected
        # ---------------------------------------------------------------------

        doc_query = (
            dbsession.query(Document)
            .filter(Document.username == self.username)
            .filter(Document.queue_id == queue_id)
        )
        with sqla_transaction_commit():
            doc_query.delete(synchronize_session=False)
            # ... will also delete the DocProcRequests via a cascade

        return self.create_response(
            status=HttpStatus.OK,
            extra_info={
                NKeys.CLIENT_JOB_ID:
                "" if client_job_id is None else client_job_id,
                NKeys.RESULTS: doc_results,
            })